# options
options(stringsAsFactors = F)
knitr::opts_chunk$set(echo = TRUE, warning = FALSE, message = FALSE)
knitr::opts_knit$set(progress = FALSE)
# packages
# qtl mapping + mediation
library(intermediate) # "simecek/intermediate"
# library(intermediate2) # https://github.com/duytpm16/intermediate2
library(qtl2) 
# # plotting
library(plotly)
library(ggpubr)
library(ggraph)
library(pheatmap)
library(cowplot)
library(ggbeeswarm)
library(GGally)
library(corrplot)
# annotations + general genomic things
#library(biomaRt)
library(TxDb.Mmusculus.UCSC.mm10.knownGene)
library(org.Mm.eg.db)
library(ChIPseeker)
library(GenomicRanges)
#library(ensimplR) # https://github.com/churchill-lab/ensimplR
library(qvalue)
library(LOLA)  

# data processing
library(pcaMethods) # pca
library(Hmisc) # rcorr
# library(WGCNA)
library(gprofiler2)
# set gprofiler version
set_base_url("http://biit.cs.ut.ee/gprofiler_archive3/e106_eg53_p16/")

# library(sva)
library(WebGestaltR)
library(readxl)
library(tidyverse)
select <- dplyr::select # I am adding this explicitly
rename <- dplyr::rename # I am adding this explicitly
library(downloadthis)
# setting path
library(here)
# get functions
source(here("_src/functions.R")) # source all the common functions



Figure 1A: Overview of data sets

knitr::include_graphics(here("Figure1A.png"))
Figure 1A: Nearly 200 embryonic stem cell lines were established from blastocysts of Diversity Outbred mice, and quantified using ATAC-seq, RNA-seq (Skelly et al., 2020), and multiplexed mass spectrometry; 163 lines have all three measures.

Figure 1A: Nearly 200 embryonic stem cell lines were established from blastocysts of Diversity Outbred mice, and quantified using ATAC-seq, RNA-seq (Skelly et al., 2020), and multiplexed mass spectrometry; 163 lines have all three measures.


Table S1.Quantitative proteomic analysis of DO mESCs.

Table containing normalized and filtered protein abundances (n = 7,432) across individual DO mESClines (n = 190) and DO mESC line details with experimental covariates sex and genotype at the Lifr locus.


Annotations for all proteins in our data set can be downloaded using the link below.

list(all.prots) %>% 
  downloadthis::download_this(
    output_name = "Protein annotations",
    output_extension = ".xlsx",
    button_label = "Download protein annotations as xlsx",
    button_type = "primary",
    has_icon = TRUE,
    icon = "fa fa-save"
  )



Figure 1B: Detection bias in proteomics

# get the list of protein coding genes with transcript abundance in DO mESCs
all.prot_coding.genes <-  all.genes %>% 
  filter( gene_biotype =="protein_coding")

# get average transcript abundance for all protein coding genes
mrna <- expr.esc_rna %>%
  as_tibble( rownames = "sampleid") %>%
  select( all.prot_coding.genes$ensembl_gene_id) %>% 
  summarize_all(.funs = mean, na.rm = T) %>% 
  pivot_longer( all.prot_coding.genes$ensembl_gene_id, 
                names_to = "ensembl_gene_id", 
                values_to = "avg_rna")

# annotate if the gene is found in the protein data by setting prot = TRUE/FALSE
# add average transcript abundance 
all.prot_coding.genes%>% 
  select(ensembl_gene_id, mgi_symbol,ensembl_gene_id) %>%
  mutate(prot = ifelse( (ensembl_gene_id %in% all.prots$ensembl_gene_id |
                           mgi_symbol %in% all.prots$mgi_symbol), TRUE, FALSE)) %>%
  left_join(., mrna) -> prop.data


  
  
# calculate the probability of a protein being present given its mRNA abundance
get_props <- function(dat) {
  prop.table <- c()
  for (i in c(1, 5, 10, 25, 50, 75, 100, 150, 200, 250, 300, 400, 500, 600, 700, 800, 800, 1000, 2000, 5000, 10000, 50000, 1e05)) {
    prop <- dat %>%
      mutate(mRNA = ifelse(avg_rna > i, "at least", "less than")) %>%
      count(mRNA, prot) %>%
      spread(prot, n) %>%
      mutate( `FALSE` = ifelse( is.na(`FALSE`), 0, `FALSE`),
              `TRUE` = ifelse( is.na(`TRUE`), 0, `TRUE`)) %>% # replace NAs in case there aren't any detected/undetected proteins
      filter( (`TRUE`+`FALSE` > 5)) %>% # filter to make sure each bin has at least 5 genes. 
      mutate(detection_probability = `TRUE` / (`FALSE` + `TRUE`))
    prop.temp <- prop %>%
      filter(mRNA == "at least") %>%
      select(detection_probability) %>%
      mutate(avg_rna = i)
    prop.table <- rbind(prop.table, prop.temp)
  }
  return(prop.table)
}

Figure1b_data <- get_props(prop.data) %>% 
  rename( `Probability of protein detection` = detection_probability,
          `Average transcript abundance` = avg_rna)
prob_plot <- Figure1b_data %>% 
  ggplot() +
  aes(x = `Average transcript abundance`, 
      y = `Probability of protein detection`) +
  geom_point(size = 4) +
  scale_x_log10( expand= expansion( mult = c(.1, .12))) +
  theme_pubclean(base_size = 12) +
  geom_line() +
  ylab("Probability of protein detection") +
  xlab("Average transcript abundance") +
  ylim(0.5, .95)+
  theme(
    axis.text = element_text( size = 12),
    axis.title = element_text( size = 12)
  )

ggarrange(prob_plot, 
          labels = "B", 
          font.label = list( size = 20))
Figure 1B: Protein detection rate is linked to transcript abundance. The probability of a gene to have protein abundance measurement given its average transcript abundance among 174 mESCs with both transcriptome and proteome data.

Figure 1B: Protein detection rate is linked to transcript abundance. The probability of a gene to have protein abundance measurement given its average transcript abundance among 174 mESCs with both transcriptome and proteome data.


Figure1b_data %>% 
  mutate_if( is.numeric, round ,2) %>% 
  create_dt()

Data used in Figure 1B.


Table S2. Over-representation analysis of detected and undetected proteins.

Over-represented biological processes and pathways in proteins detected in all samples, and protein coding genes with transcript abundance lacking protein abundance (undetected proteins) are listed. Details include the data source, term id, term name, term size, the number of intersecting proteins and the FDR for each term identified to be overrepresented in detected or undetected protein coding genes.

# get proteins expressed in all samples
expr.esc_prot.comp <- expr.esc_prot %>%
  as.data.frame() %>%
  select( all_of(all.prots$protein_id))  %>% 
  select_if(~ !any(is.na(.)))

# get the gene/protein details for proteins measured in ALL the samples
prots.detected <- all.prots %>% 
  filter(protein_id %in% colnames(expr.esc_prot.comp),
         gene_biotype== "protein_coding")

# over-representation analysis for proteins detected in all samples vs all proteins
g.prot.detected <- gost(
  query = prots.detected$mgi_symbol,
  organism = "mmusculus",
  domain_scope = "custom",
  custom_bg = all.prots$mgi_symbol,
  evcodes = TRUE,
  correction_method = "fdr"
)
g.prot.detected$result <- g.prot.detected$result %>% 
  filter(term_size < 600)

g.prot.detected$result %>% 
  select(
    `Data source` = source,
    `Term ID` = term_id,
    `Term Name` = term_name, 
    `Term size` = term_size, 
    `# of intersecting proteins` = intersection_size,
     FDR = p_value
    ) -> tables2_sheet1

# get proteins not detected although RNA is detected
all.genes %>% 
  filter( !ensembl_gene_id %in% all.prots$ensembl_gene_id & # not in protein data
            gene_biotype == "protein_coding") -> not.detected 

all.prot_coding.genes <-  all.genes %>% 
  filter( gene_biotype =="protein_coding")

g.prot.not.detected <- gost(
  query = not.detected$mgi_symbol,
  organism = "mmusculus",
  domain_scope = "custom",
  custom_bg = all.prot_coding.genes$mgi_symbol, 
  evcodes = TRUE,
  correction_method = "fdr"
)
g.prot.not.detected$result <- g.prot.not.detected$result %>% filter(term_size < 600)

g.prot.not.detected$result %>% 
  select(
    `Data source` = source,
    `Term ID` = term_id,
    `Term Name` = term_name, 
    `Term size` = term_size, 
    `# of intersecting proteins` = intersection_size,
     FDR = p_value
    ) -> tables2_sheet2



Figure S1A-C: Transcription factors show lower transcript and protein abundance than other genes

rna_detect_plot <- apply(na.omit(expr.esc_rna), 2, mean) %>%
  as.data.frame() %>%
  rownames_to_column() %>%
  left_join(., all.genes, by = c("rowname" = "ensembl_gene_id")) %>%
  filter(gene_biotype %in% c("protein_coding")) %>%
  mutate(detected = ifelse(rowname %in% all.prots$ensembl_gene_id, "Detected", "Not detected")) %>%
  ggplot() +
  aes(x = detected, y = `.`) +
  geom_boxplot(width = 0.2) +
  ylab("Average transcript abundance") +
  scale_y_log10( expand = expansion(mult =c(0.4,0.2))) +
  theme_pubclean(base_size = 18) +
  #ggtitle("Protein coding genes") +
  xlab("") +
  stat_compare_means(method = "anova",label.y=7.5)+
  stat_compare_means(method = "t.test",label.y=6.5)
  

tf_rna_plot <- apply(na.omit(expr.esc_rna), 2, mean) %>%
  as.data.frame() %>%
  rownames_to_column() %>%
  left_join(., all.genes, by = c("rowname" = "ensembl_gene_id")) %>%
    filter(gene_biotype %in% c("protein_coding")) %>%
  mutate(is_tf = ifelse( mgi_symbol %in% all.tfs$mgi_symbol, "TF", "Not a TF")) %>%
  ggplot() +
  aes(x = is_tf, y = `.`) +
  geom_boxplot(width = 0.2) +
  ylab("Average transcript abundance") +
  scale_y_log10( expand = expansion(mult =c(0.4,0.2))) +
  theme_pubclean(base_size = 18) +
  #ggtitle("Protein coding genes") +
  xlab("") +
  stat_compare_means(method = "anova",label.y=7.5)+
  stat_compare_means(method = "t.test",label.y=6.5)

tf_prot_plot <- apply((expr.esc_prot[,all.prots$protein_id]), 2, mean, na.rm=TRUE) %>%
  as.data.frame() %>%
  rownames_to_column() %>%
  left_join( ., all.prots, by = c("rowname" = "protein_id")) %>% 
  mutate(is_tf = ifelse( mgi_symbol %in% all.tfs$mgi_symbol, "TF", "Not a TF")) %>%
  ggplot() +
  aes(x = is_tf, y = `.`) +
  geom_boxplot(width = 0.2) +
  ylab("Average protein abundance") +
  theme_pubclean(base_size = 18) +
  #ggtitle("Protein coding genes") +
  xlab("") +
  scale_y_continuous( expand =expansion(mult =c(0.3,0.2))) +
  stat_compare_means(method = "anova", label.y=17)+
  stat_compare_means(method = "t.test",label.y=15.5)



detection_plot <- ggarrange(rna_detect_plot, 
                            tf_rna_plot,tf_prot_plot, 
                            nrow = 1, 
                            widths = c(0.7,0.7,0.7), 
                            labels = c("A","B","C"),
                            font.label = list( size = 20))

detection_plot
FigureS1: (A) Genes where protein abundance is detected have a significantly higher mean transcript abundance (One way ANOVA, followed by t-test). Average transcript abundance of protein coding genes (n = 12,732) that are detected (TRUE, n = 7,240) and not detected (FALSE, n = 5,492) in the proteomics data are plotted. (B, C) TFs show a significantly lower mean for both transcript and protein abundance in comparison to other genes (One way ANOVA, followed by t-test). Average transcript and protein abundance of protein coding genes that are transcription factors (TF) and not transcription factors (Not a TF) are plotted.

FigureS1: (A) Genes where protein abundance is detected have a significantly higher mean transcript abundance (One way ANOVA, followed by t-test). Average transcript abundance of protein coding genes (n = 12,732) that are detected (TRUE, n = 7,240) and not detected (FALSE, n = 5,492) in the proteomics data are plotted. (B, C) TFs show a significantly lower mean for both transcript and protein abundance in comparison to other genes (One way ANOVA, followed by t-test). Average transcript and protein abundance of protein coding genes that are transcription factors (TF) and not transcription factors (Not a TF) are plotted.



Figure 1C: Principal component analysis of the pluripotent proteome

# principal component analysis using pcaMethods::pca function.
pca.prot <- pca(object = expr.esc_prot, 
                            method = "svdImpute", 
                            scale = "uv", 
                            nPcs = 10, 
                            center = TRUE
                            )

# get values for Principal components
Figure1c_data <- scores(pca.prot) %>%
  as_tibble( rownames ="sampleid") %>% 
  left_join(covarTidy.esc_prot) %>%
  mutate(sex = ifelse(sex == "F", "Female", "Male")) %>%
  rename(
    `Sample ID` = sampleid, 
    Sex = sex
  ) 
Figure1c_data %>% 
  ggplot() +
  aes(x = PC1, y = PC2, col = Sex) +
  geom_point(size = 4, alpha = 0.7) +
  theme(
    axis.text = element_text(size = 18),
    axis.title = element_text(size = 20), 
    legend.text = element_text(size = 16), legend.title = element_text(size = 16)
  ) +
  xlab(paste0("PC1 (", round(pca.prot@R2[1], 3) * 100, "%)")) +
  ylab(paste0("PC2 (", round(pca.prot@R2[2], 3) * 100, "%)")) +
  theme_pubclean(base_size = 18) + 
  color_palette("npg")+
  fill_palette("npg")+
  xlim(-100,150)+
  ylim(-100,150)+
  facet_wrap(~Sex, strip.position = "right")+
  theme(
    strip.background = element_blank(),
    strip.text.x = element_blank()
  ) -> pca_plot

ggarrange(pca_plot, 
          labels = "C",
          font.label = list( size = 20))
Figure 1C: Principal component analysis reveals sex as a significant source of variation among DO mESC proteomes. PC1 and PC2 for 190 mESCs are plotted and colored by sex.

Figure 1C: Principal component analysis reveals sex as a significant source of variation among DO mESC proteomes. PC1 and PC2 for 190 mESCs are plotted and colored by sex.


The data used in plotting Figure1C can be downloaded using the link below.

list( Figure1c_data %>% 
        select(`Sample ID`, Sex, PC1, PC2)) %>% 
  downloadthis::download_this(
    output_name = "Figure1C data",
    output_extension = ".xlsx",
    button_label = "Download Figure1C data as xlsx",
    button_type = "primary",
    has_icon = TRUE,
    icon = "fa fa-save"
  )


Drivers of PC1

# get top drivers of PC1
loadings(pca.prot) %>%
  as_tibble( rownames = "protein_id") %>% 
  left_join( all.prots) %>% 
  select(mgi_symbol, gene_chr, PC1) %>%
  filter( abs(PC1) >= quantile(abs(PC1), 0.90))-> pc1.loadings # most contributing 10%

# PC1 drivers by chromosome

pc1.loadings %>% 
  group_by(gene_chr) %>%
  # count() %>%
  # arrange(desc(n)) %>%
  ggplot()+
  aes( x = gene_chr)+
  geom_bar()+
  theme_pubclean( base_size = 18)+
  xlab("Chr")
Chromosomal locations of the top 10% of proteins that contribute to PC1.

Chromosomal locations of the top 10% of proteins that contribute to PC1.


g.pc1 <- gost(query = pc1.loadings$mgi_symbol, 
              organism = "mmusculus", 
              domain_scope = "custom", 
              custom_bg = all.prots$mgi_symbol, 
              evcodes = TRUE,
              correction_method = "fdr")
g.pc1$result <- g.pc1$result %>% filter(term_size < 600)

g.pc1$result %>% 
   select(
    `Data source` = source,
    `Term ID` = term_id,
    `Term Name` = term_name, 
    `Term size` = term_size, 
    `# of intersecting proteins` = intersection_size,
     FDR = p_value
    ) %>%
  mutate_if( is.numeric, formatC, digits =2) %>% 
  create_dt()

Over-represented biological processes and pathways in PC1 drivers.


Figure S1D-E: Variation in protein abundance

expr.esc_prot %>%
  as_tibble(.) %>%
  summarise_all(list(~ mean(., na.rm = T))) %>% 
  pivot_longer( all.prots$protein_id,
                names_to = "protein_id",
                values_to ="mean.prot") %>% 
  ggplot() +
  aes(x = mean.prot) +
  geom_histogram(binwidth = 0.1) +
  xlab("Mean") +
  theme_pubclean(base_size = 18) -> p.mean.hist

expr.esc_prot %>%
  as_tibble(.) %>%
  summarise_all(list(~ var(., na.rm = T))) %>% 
  pivot_longer( all.prots$protein_id,
                names_to = "protein_id",
                values_to ="var") %>% 
  ggplot() +
  aes(x = var) +
  geom_histogram(binwidth = 0.01) +
  xlab("Variance") +
  theme_pubclean(base_size = 18) +
  scale_x_log10() -> p.var.hist

ggarrange(p.mean.hist, 
          p.var.hist, 
          nrow =1, 
          labels = c("D","E"),
          font.label = list( size = 18))
(D, E) Protein abundance is highly variable across DO mESCs. Histograms showing the mean abundance and variance per protein plotted for 7,342 proteins across 190 DO mESC lines.

(D, E) Protein abundance is highly variable across DO mESCs. Histograms showing the mean abundance and variance per protein plotted for 7,342 proteins across 190 DO mESC lines.


Sex effects on protein abundance

# updating the code to use anova followed by tukey's hsd:
expr.esc_prot %>%
  t() %>% 
  as_tibble(rownames = "protein_id") %>%
  filter( protein_id %in% all.prots$protein_id) %>% 
  pivot_longer( cols = rownames(expr.esc_prot),
                values_to = "protein_ab",
                names_to = "sampleid") %>% 
  left_join(., select(covarTidy.esc_prot, sampleid, sex)) %>% 
  group_by(protein_id) %>% 
  rstatix::anova_test( protein_ab ~ sex) %>% 
  rstatix::adjust_pvalue( method = "BH") %>% 
  rstatix::add_significance("p.adj") %>% 
  as_tibble() -> prot_sex_aov

# passing the full data to tukey's then filtering
expr.esc_prot %>%
  t() %>% 
  as_tibble(rownames = "protein_id") %>% 
  filter( protein_id %in% all.prots$protein_id) %>% 
  pivot_longer( cols = rownames(expr.esc_prot),
                values_to = "protein_ab",
                names_to = "sampleid") %>% 
  left_join(., select(covarTidy.esc_prot, sampleid, sex)) %>% 
  group_by(protein_id) %>% 
  rstatix::tukey_hsd(protein_ab ~ sex) %>% 
  filter( protein_id %in% (filter(prot_sex_aov, p.adj.signif != "ns"))$protein_id ) -> prot_sex_tukeys


# get the medians for later
expr.esc_prot %>%
  t() %>% 
  as_tibble(rownames = "protein_id") %>%
  filter( protein_id %in% all.prots$protein_id) %>% 
  pivot_longer( cols = rownames(expr.esc_prot),
                values_to = "protein_ab",
                names_to = "sampleid") %>% 
  left_join(., select(covarTidy.esc_prot, sampleid, sex)) %>% 
  group_by(protein_id,sex) %>% 
  summarize( med = median(protein_ab, na.rm =T)) %>% 
  pivot_wider( id_cols = "protein_id",
               names_from = "sex",
               values_from = "med")-> prot_sex_med
prot_sex_tukeys %>%
  left_join( all.prots) %>% 
  left_join( prot_sex_med) %>% 
  arrange(p.adj) %>%
  mutate_if( is.numeric, round, 2) %>%
  select(
    `Protein ID` = protein_id,
    `MGI Symbol`= mgi_symbol, 
    `Protein location (chr)` = gene_chr,
    `Female median`=`F`,
    `Male median`= M
   ) %>%
  create_dt()

Table of proteins showing a significant sex effect.



Figure 1D: Geneset variation analysis of the pluripotent proteome

library(GSVA)
library(GO.db)

# Preparing gene sets from GO
# reading in the GO + mgi downloaded from: http://www.informatics.jax.org/gotools/data/input/MGIgenes_by_GOid.txt
go_terms <- read_tsv( "http://www.informatics.jax.org/gotools/data/input/MGIgenes_by_GOid.txt") %>% 
  mutate( genes = str_split(genes, ",")) %>% 
  unnest() # separete the symbols, note the overlap: length(intersect(unique(go_terms$genes), all.prots$mgi_symbol) ) = 6757

slim_go_terms <- read_tsv( "http://www.informatics.jax.org/gotools/data/input/map2MGIslim.txt") %>% 
  select(-term) %>% 
  mutate( ONT = case_when( aspect == "P" ~  "BP",
                     aspect == "F" ~ "MF",
                     aspect == "C" ~ "CC"
                     )
          ) %>% 
  select(-aspect)

genesbygo <- split(go_terms$genes, go_terms$GO_id)

go_terms_annot <- go_terms %>%  
  select(GO_id) %>% 
  distinct() %>% 
  left_join( slim_go_terms %>%  select( GO_id, ONT) %>% distinct())

goannot_wdef <- AnnotationDbi::select(GO.db, keys= unique(go_terms$GO_id), columns=c("GOID","DEFINITION","ONTOLOGY","TERM")) %>%
  left_join( slim_go_terms, by=c("GOID"="GO_id")) %>% 
  mutate( ONTOLOGY = ONT) %>% 
  select(-ONT)

go_bp <- goannot_wdef %>% filter( ONTOLOGY == "BP") %>% 
  select(GOID) %>%  distinct()

# Run GSVA using protein abundance
expr.esc_prot_upd <- expr.esc_prot[, all.prots$protein_id]
colnames(expr.esc_prot_upd) <- all.prots$mgi_symbol
gsva_prot <- gsva(  expr = t(expr.esc_prot_upd),
                    genesbygo,
                    method ="gsva",
                    kcdf = "none",
                    min.sz = 5, 
                    max.sz = 1000,
                    mx.diff = TRUE)

# Run GSVA with complexes to complement the co-abundance analysis
genes_for_complex_gsva <- complex.genes %>%  
  enframe( "Complex Name","human_ids") %>%
  unnest(human_ids) %>% 
  left_join( complex.gene.list) %>% 
  filter( !is.na(protein_id)) %>% 
  select( `Complex Name` , protein_id)
gsva_complexes <- unique(genes_for_complex_gsva$`Complex Name`)
genes_by_complex <- split(genes_for_complex_gsva$protein_id, genes_for_complex_gsva$`Complex Name`)
gsva_prot_comp <-  gsva(  expr = t(expr.esc_prot[,all.prots$protein_id]),
                    genes_by_complex,
                    method ="gsva",
                    kcdf = "none",
                    min.sz = 5, 
                    max.sz = 1000,
                    mx.diff = TRUE)

# Run GSVA using transcript abundance
expr.esc_rna_upd <- expr.esc_rna[, all.genes$ensembl_gene_id]
colnames(expr.esc_rna_upd) <- all.genes$mgi_symbol
gsva_rna <- gsva(  expr = t(expr.esc_rna_upd),
                    genesbygo,
                    method ="gsva",
                    kcdf = "none",
                    min.sz = 5, 
                    max.sz = 1000,
                    mx.diff = TRUE)

# Annotation and statistical follow up using ANOVA + Tukey's on Protein results
covar.lifr  %>% 
  rename( top_muga = rowname ) %>% 
  left_join(sample.matches.all) %>% 
  select(sampleid, lifr_geno) %>% 
  inner_join(covarTidy.esc_prot) -> covar_lifr_upd

gsva_prot %>% 
  as_tibble(rownames = "Category") %>% 
  filter( Category %in% go_bp$GOID) %>% #filtering for BP
  rbind( as_tibble(gsva_prot_comp, rownames = "Category" )) %>% 
  # rbind( as_tibble(gsva_prot3, rownames = "Category" )) %>% 
  # rbind( as_tibble(gsva_prot4, rownames = "Category")) %>% 
  pivot_longer( cols = rownames(expr.esc_prot_upd),
                values_to = "Enrichment_Score",
                names_to = "sampleid") %>% 
  # add sexes + lifr genotypes
  left_join( covar_lifr_upd) -> gsva_results


gsva_results %>% 
  group_by( Category) %>% 
  rstatix::anova_test( Enrichment_Score ~ sex+lifr_geno+sex*lifr_geno) %>% 
  rstatix::adjust_pvalue(method = "BH") %>%
  rstatix::add_significance("p.adj") %>% 
  ungroup() -> aov_results 

aov_results %>% 
  as_tibble() %>% 
  filter( p.adj.signif != "ns" ) -> signif_eff_terms

# passing all to Tukey's with the full model
gsva_results %>% 
  group_by(Category) %>% 
  rstatix::tukey_hsd( Enrichment_Score ~ sex+lifr_geno+sex:lifr_geno) %>% 
  ungroup() %>% 
  as_tibble() %>% 
  left_join( goannot_wdef, by = c("Category" = "GOID")) -> results_tukey

# filtering for the ones that were significant in ANOVA + Tukey's
results_tukey %>% 
  filter( p.adj < 0.05) %>% 
  inner_join( ., select( signif_eff_terms, Category, term = Effect)) -> signif_results_tukey

# Annotation and statistical follow up using ANOVA + Tukey's on RNA results
gsva_rna %>% 
  as_tibble(rownames = "Category") %>% 
  filter( Category %in% go_bp$GOID) %>% #filtering for BP
  pivot_longer( cols = rownames(expr.esc_rna_upd),
                values_to = "Enrichment_Score",
                names_to = "sampleid") %>% 
  # add sexes + lifr genotypes
  left_join( covar_lifr_upd) -> gsva_rna_results

gsva_rna_results %>% 
  group_by( Category) %>% 
  rstatix::anova_test( Enrichment_Score ~ sex+lifr_geno+sex*lifr_geno) %>% 
  rstatix::adjust_pvalue( method = "BH") %>%
  rstatix::add_significance("p.adj") %>% 
  ungroup() -> gsva_rna_aov

gsva_rna_results %>% 
  group_by(Category) %>% 
  rstatix::tukey_hsd( Enrichment_Score ~ sex+lifr_geno+sex:lifr_geno) %>% 
  ungroup() %>% 
  as_tibble() %>% 
  left_join( goannot_wdef, by = c("Category" = "GOID")) -> gsva_rna_tukey

gsva_rna_aov %>% 
  as_tibble() %>% 
  filter( p.adj.signif != "ns" ) -> signif_eff_terms_rna

gsva_rna_tukey %>% 
  filter( p.adj < 0.05) %>% 
  inner_join( ., select( signif_eff_terms_rna, Category, term = Effect)) -> signif_results_tukey_rna
Figure1d_data <- gsva_results %>%
  filter( Category %in% c("GO:0006306", # DNA methylation
                          "GO:0006338", # Chromatin remodeling
                          "GO:0042254" # Ribosome biogenesis
                          )) %>%
  left_join( select(goannot_wdef, Category = GOID, TERM)) %>% 
  select( Category, TERM, sampleid, Enrichment_Score, sex, lifr_geno) %>% 
  left_join(
    signif_results_tukey %>% 
      filter( term =="sex") %>% 
      select(Category, p.adj, p.adj.signif )
  ) %>% 
  rbind(
       gsva_results %>%  
         filter( Category == "GO:0006471") %>%  # protein ADP-ribosylation
         left_join( select(goannot_wdef, Category = GOID, TERM)) %>% 
         select( Category, TERM, sampleid, Enrichment_Score, sex, lifr_geno) %>% 
         left_join(
           signif_results_tukey %>% 
             filter( term =="lifr_geno") %>% 
             select(Category, p.adj, p.adj.signif )
           )
       ) %>% 
  mutate( Enrichment_Score = round(Enrichment_Score, 2)) %>% 
  select( `GO ID` = Category, 
          `GO Term`=TERM, 
          Sample = sampleid, 
          `Enrichment Score`=Enrichment_Score, 
          Sex = sex, 
          `Lifr genotype`=lifr_geno, 
          `Significance`=p.adj.signif)
Figure1d_data %>% 
  ggplot()+
  aes( x = Sex,
       y = `Enrichment Score`,
       col = Sex)+
  geom_boxplot(width =0.2, size = 1.1)+
  #geom_jitter()+
  #geom_beeswarm(aes(col = sex))+
  theme_pubclean(base_size = 16)+
  stat_pvalue_manual( filter(signif_results_tukey,Category  == "GO:0006306", term == "sex"),
                      label = "{p.adj.signif}",
                      y.position = 0.85)+
  color_palette("npg")+
  ylab("Enrichment Score")+
  ggtitle("DNA Methylation")+
  xlab("")+
  ylim(-1,1)+
  theme(axis.text.x = element_text(size = 18),
        axis.text.y = element_text(size = 18),
        axis.title = element_text(size =18),
        plot.title = element_text(size =18, hjust = 0.5)) -> p_met

Figure1d_data %>% 
  ggplot()+
  aes( x = Sex,
       y = `Enrichment Score`,
       col = Sex)+
  geom_boxplot(width =0.2, size = 1.1)+
  #geom_jitter()+
  #geom_beeswarm(aes(col = sex))+
  theme_pubclean(base_size = 16)+
  stat_pvalue_manual( filter(signif_results_tukey,Category  == "GO:0006338", term == "sex"),
                      label = "{p.adj.signif}",
                      y.position = 0.85)+
  color_palette("npg")+
  ylab("Enrichment Score")+
  ggtitle("Chromatin remodeling")+
  xlab("")+
  ylim(-1,1)+
  theme(axis.text.x = element_text(size = 18),
        axis.text.y = element_text(size = 18),
        axis.title = element_text(size =18),
        plot.title = element_text(size =18, hjust = .5)) -> p_chrom

Figure1d_data %>% 
  ggplot()+
  aes( x = Sex,
       y = `Enrichment Score`,
       col = Sex)+
  geom_boxplot(width =0.2, size = 1.1)+
  #geom_jitter()+
  #geom_beeswarm(aes(col = sex))+
  theme_pubclean(base_size = 16)+
  stat_pvalue_manual( filter(signif_results_tukey,Category  == "GO:0042254", term == "sex"),
                      label = "{p.adj.signif}",
                      y.position = 0.85)+
  color_palette("npg")+
  ylab("Enrichment Score")+
  ggtitle("Ribosome biogenesis")+
  xlab("")+
  ylim(-1,1)+
  theme(axis.text.x = element_text(size = 18),
        axis.text.y = element_text(size = 18),
        axis.title = element_text(size =18),
        plot.title = element_text(size =18, hjust = .5)) -> p_ribo

gsva_results %>%
  filter( Category ==  "GO:0006471") %>%
  left_join( select(goannot_wdef, Category = GOID, TERM)) %>% 
  rename( `GO ID` = Category, 
          `GO Term`=TERM, 
          Sample = sampleid, 
          `Enrichment Score`=Enrichment_Score, 
          Sex = sex, 
          `Lifr genotype`=lifr_geno) %>% 
  ggplot()+
  aes( x = `Lifr genotype`,
       y =  `Enrichment Score`,
       col = `Lifr genotype`)+
  geom_boxplot(width =0.2, size = 1.1)+
  #geom_jitter()+
  #geom_beeswarm(aes(col = sex))+
  theme_pubclean(base_size = 16)+
  stat_pvalue_manual( filter(signif_results_tukey,Category  == "GO:0006471", term == "lifr_geno"),
                      label = "{p.adj.signif}",
                      y.position = c(0.85, 0.95))+
  color_palette("Dark2")+
  ylab("Enrichment Score")+
  ggtitle("Protein ADP-ribosylation")+
  xlab("")+
  labs(col ="LIFR")+
  ylim(-1,1)+
  theme(legend.position = "none",
        axis.text.x = element_text(size = 18),
        axis.text.y = element_text(size = 18),
        axis.title = element_text(size =18),
        plot.title = element_text(size =18, hjust = .5)) -> p_adp

left <- ggarrange(p_met,p_chrom, p_ribo, 
                  common.legend = TRUE, 
                  nrow = 1,
                  ncol = 3, 
                  legend = "none")

ggarrange(left, p_adp, 
          nrow=1, 
          widths = c(.8,0.4) , 
          labels = c("D"), 
          font.label = list( size = 20))
Figure 1D: Enrichment scores obtained from GSVA for Gene Ontology Biological Processes (GO:BP) showing significant differences between mESCs with different sexes and genotypes at the Lifr locus are plotted. GO:BP DNA methylation, chromatin remodeling and ribosome biogenesis show significantly higher enrichment in males in comparison to females and, protein ADP-ribosylation shows significantly higher enrichment in mESCs with at least one copy of the reference allele in comparison to ones carrying two copies of the alternative allele at the *Lifr* locus (two-way anova followed by Tukey’s HSD, `*: p value < 0.05, ****: p value < 0.00005`).

Figure 1D: Enrichment scores obtained from GSVA for Gene Ontology Biological Processes (GO:BP) showing significant differences between mESCs with different sexes and genotypes at the Lifr locus are plotted. GO:BP DNA methylation, chromatin remodeling and ribosome biogenesis show significantly higher enrichment in males in comparison to females and, protein ADP-ribosylation shows significantly higher enrichment in mESCs with at least one copy of the reference allele in comparison to ones carrying two copies of the alternative allele at the Lifr locus (two-way anova followed by Tukey’s HSD, *: p value < 0.05, ****: p value < 0.00005).


The data used in plotting Figure1D can be downloaded using the link below.

list(Figure1d_data) %>% 
  downloadthis::download_this(
    output_name = "Figure1D data",
    output_extension = ".xlsx",
    button_label = "Download Figure1D data as xlsx",
    button_type = "primary",
    has_icon = TRUE,
    icon = "fa fa-save"
  )


Table S3: Gene Set Variation Analysis results.

Biological processes and protein complexes that show significant differences between experimental groups (sex, Lifr genotype, or their interaction) in GSVA enrichment scores obtained from protein or transcript abundance are listed. The source of the significant effect (sex, Lifr genotype or their interaction) as well as the two groups being compared is included with the Tukey’s HSD estimate and the adjusted p-value for each term.

signif_results_tukey %>% 
  #left_join( ., goannot_wdef %>%  select(TERM, GOID) %>% distinct()) %>% 
  select(Effect= term, Category, TERM,group1, group2, estimate,p.adj) %>% 
  distinct() %>% 
  mutate( estimate= round(estimate,2),
          TERM = ifelse( Category %in% genes_for_complex_gsva$`Complex Name`, "Protein Complex", TERM)) %>% 
  mutate( 
    temp = TERM,
    TERM = ifelse( TERM =="Protein Complex" & !is.na(TERM), Category, TERM ),
    Category = ifelse( temp == "Protein Complex" & !is.na(TERM), temp, Category)
          ) %>%  
  arrange(estimate) %>% 
  select( 
    Effect, 
    `Term ID` = Category,
    `Term Name` = TERM,
    `Group 1`= group1,
    `Group 2`= group2,
    `Tukey's HSD estimate` = estimate, 
    `Adjusted p-value` = p.adj
    ) -> tables3_sheet1


signif_results_tukey_rna %>%
  #left_join( ., goannot_wdef %>%  select(TERM, GOID) %>% distinct()) %>% 
  select(Effect= term, Category, TERM,group1, group2, estimate,p.adj) %>% 
  distinct() %>% 
  mutate( estimate= round(estimate,2),
          TERM = ifelse( Category %in% genes_for_complex_gsva$`Complex Name`, "Protein Complex", TERM)) %>% 
  mutate( 
    temp = TERM,
    TERM = ifelse( TERM =="Protein Complex" & !is.na(TERM), Category, TERM ),
    Category = ifelse( temp == "Protein Complex" & !is.na(TERM), temp, Category)
          ) %>% 
  arrange(estimate) %>% 
  select( 
    Effect, 
    `Term ID` = Category,
    `Term Name` = TERM,
    `Group 1`= group1,
    `Group 2`= group2,
    `Tukey's HSD estimate` = estimate, 
    `Adjusted p-value` = p.adj
    ) -> tables3_sheet2


Over-representation analysis in proteins with high and low variation in abundance

var_prot <- expr.esc_prot %>%
  as_tibble(.) %>%
  summarise_all(list(~ var(., na.rm = T))) %>% 
  pivot_longer( all.prots$protein_id,
                names_to = "protein_id",
                values_to ="var") 

high.var.prots <- var_prot %>%  
  filter( var >= quantile(var, 0.95)) %>% 
  left_join( all.prots %>%  
               select( protein_id, mgi_symbol))

low.var.prots <- var_prot %>% 
  filter( var <= quantile(var, 0.05))%>% 
  left_join( all.prots %>%  
               select( protein_id, mgi_symbol))

g.high.var <- gost(
  query = high.var.prots$mgi_symbol,
  organism = "mmusculus",
  domain_scope = "custom",
  custom_bg = all.prots$mgi_symbol,
  evcodes = TRUE,
  correction_method = "fdr"
)
g.high.var$result <- g.high.var$result %>% filter(term_size < 600)

g.low.var <- gost(
  query = low.var.prots$mgi_symbol,
  organism = "mmusculus",
  domain_scope = "custom",
  custom_bg = all.prots$mgi_symbol,
  evcodes = TRUE,
  correction_method = "fdr"
)
g.low.var$result <- g.low.var$result %>% filter(term_size < 600)

g.high.var$result %>% 
  mutate( `Gene Set` = "High variation") %>% 
  select(`Gene Set`, `Term Name` = term_name, source, FDR = p_value, `Term size` = term_size, Intersection = intersection_size,term_id)  %>% 
  rbind(
    g.low.var$result %>%
      mutate( `Gene Set` = "Low variation") %>% 
       select(`Gene Set`, `Term Name` = term_name, source, FDR = p_value, `Term size` = term_size, Intersection = intersection_size, term_id) 
    ) %>% 
  select(
         `Gene Set`, 
    `Data source` = source,
    `Term ID` = term_id,
    `Term Name` , 
    `Term size` , 
    `# of intersecting proteins` = Intersection,
     FDR 
    ) %>% 
  mutate_if( is.numeric, formatC, digits =2) %>% 
  create_dt()

Over-represented biological processes and pathways in proteins with high (top 5th percentile) and low variation (bottom 5th percentile).


Figure S1F-G

# get proteins identified as part of "extracellular region"
ecm_genes <- tibble(mgi_symbol = unlist(str_split((g.high.var$result %>% filter(term_name %in% c("extracellular region", "extracellular matrix")))$intersection, ","))) %>%
  left_join(., all.prots)

# get proteins identified as REX1 targets 
rex1.genes <- tibble(mgi_symbol = unlist(str_split((g.low.var$result %>% filter(source == "TF"))$intersection[1], ","))) %>%
  left_join(., all.prots)

# get variance + mean by protein into a single data frame
mean_prot <- expr.esc_prot %>%
  as_tibble(.) %>%
  summarise_all(list(~ mean(., na.rm = T))) %>% 
  pivot_longer( all.prots$protein_id,
                names_to = "protein_id",
                values_to ="mean") 

var_mean_prot <- var_prot %>% 
  full_join( mean_prot) %>% 
  left_join( all.prots)
  
var_mean_prot %>% 
  ggscatter(., 
            x = "mean", 
            y = "var", 
            size = 3, 
            alpha = 0.5,
            col="gray",
            yscale = "log10",
            #xscale = "log10",
            show.legend.text = FALSE
            ) +
  xlab("Mean protein abundance") +
  ylab("Variance in protein abundance") +
  ggtitle("REX1 Target Proteins")+
  theme_pubclean(base_size = 18) + 
  rremove("legend") +
  geom_point(
    data =   filter( var_mean_prot, ensembl_gene_id %in% rex1.genes$ensembl_gene_id) ,
    col = "blue", alpha = 0.6, size = 3)+
  geom_point( data = filter(var_mean_prot, mgi_symbol == "Zfp42"), col = "purple", size = 4, alpha = 1)+
  geom_label( data = filter(var_mean_prot, mgi_symbol == "Zfp42") , label = "Rex1", nudge_x = .2, nudge_y = .2)  -> plot_rex1


var_mean_prot %>% 
  ggscatter(., 
            y = "var", 
            x = "mean", 
            size = 3, 
            alpha = 0.5,
            col="gray",
            yscale = "log10",
            #xscale = "log10",
            show.legend.text = FALSE
            ) +
  ylab("Variance in protein abundance") +
  xlab("Mean protein abundance") +
  theme_pubclean(base_size = 18) + 
  rremove("legend") +
  geom_point(
    data =   filter( var_mean_prot, ensembl_gene_id %in% c(ecm_genes$ensembl_gene_id)) ,
    col = "blue", alpha = 0.6, size = 3) +
  ggtitle("Extracellular Matrix Proteins")-> plot_ecm

ggarrange( plot_ecm, plot_rex1, nrow =1 ,
           labels = c("F","G"),
          font.label = list( size = 20)
        )
FigureS1: (F) Mean abundance and variance plotted for all proteins (gray) with proteins identified as part of `Extracellular region` and `ECM protein` GO Terms in most variable proteins (top 5th percentile %CV), in overrepresentation analysis, highlighted in blue. (G) Mean abundance and variance plotted for all proteins with proteins identified as `REX1 Target` in TRANSFAC database in least variable proteins (bottom 5th percentile %CV), in overrepresentation analysis, highlighted in blue and REX1 (Zfp42) highlighted in purple.

FigureS1: (F) Mean abundance and variance plotted for all proteins (gray) with proteins identified as part of Extracellular region and ECM protein GO Terms in most variable proteins (top 5th percentile %CV), in overrepresentation analysis, highlighted in blue. (G) Mean abundance and variance plotted for all proteins with proteins identified as REX1 Target in TRANSFAC database in least variable proteins (bottom 5th percentile %CV), in overrepresentation analysis, highlighted in blue and REX1 (Zfp42) highlighted in purple.

---
title: "Pluripotent Proteome"
output:
  html_document:
    toc: true
    toc_depth: 4
    toc_float: 
      collapsed: false
      smooth_scroll: false
    df_print: paged
    code_folding: hide
---

```{=html}
<style>
p.caption {
  font-size: 1em;
}
</style>
```

```{r load_packages, warning=FALSE, message=FALSE, echo = TRUE}
# options
options(stringsAsFactors = F)
knitr::opts_chunk$set(echo = TRUE, warning = FALSE, message = FALSE)
knitr::opts_knit$set(progress = FALSE)
# packages
# qtl mapping + mediation
library(intermediate) # "simecek/intermediate"
# library(intermediate2) # https://github.com/duytpm16/intermediate2
library(qtl2) 
# # plotting
library(plotly)
library(ggpubr)
library(ggraph)
library(pheatmap)
library(cowplot)
library(ggbeeswarm)
library(GGally)
library(corrplot)
# annotations + general genomic things
#library(biomaRt)
library(TxDb.Mmusculus.UCSC.mm10.knownGene)
library(org.Mm.eg.db)
library(ChIPseeker)
library(GenomicRanges)
#library(ensimplR) # https://github.com/churchill-lab/ensimplR
library(qvalue)
library(LOLA)  

# data processing
library(pcaMethods) # pca
library(Hmisc) # rcorr
# library(WGCNA)
library(gprofiler2)
# set gprofiler version
set_base_url("http://biit.cs.ut.ee/gprofiler_archive3/e106_eg53_p16/")

# library(sva)
library(WebGestaltR)
library(readxl)
library(tidyverse)
select <- dplyr::select # I am adding this explicitly
rename <- dplyr::rename # I am adding this explicitly
library(downloadthis)
# setting path
library(here)
# get functions
source(here("_src/functions.R")) # source all the common functions
```

```{r load_annotate_omics_data, warning=FALSE, message=FALSE, results="hide" , echo =FALSE}
##### Protein data
# Note the ensembl protein id's are from v98
load(here("../pQTL_website/_data/DO_mESC_pQTL_forMapping_noPoly_v3.RData"))
exprZ.esc_prot <- exprZ
probs.esc_prot <- genoprobs
covar.esc_prot <- covar
covarTidy.esc_prot <- covarTidy
kinship_loco.esc_prot <- kinship_loco
# I don't want to use the rankZ normalized version. let's clean up the raw data for use.
raw.expr.esc_prot <- raw.expr
expr.esc_prot <- raw.expr[rownames(raw.expr) %in% rownames(exprZ), colnames(raw.expr) %in% colnames(exprZ)]
expr.esc_prot <- as.matrix(expr.esc_prot)
rm(exprZ, covar, covarTidy, kinship_loco, genoprobs, raw.expr)
##### RNA data
# Note the ensembl gene id's are from v82
# Note that there are 20 ensembl gene id's that don't match anything even in v82.
load(here("../pQTL_website/_data/DO_mESC_paired_eQTL_forMapping.RData"))
raw.expr.esc_rna <- esc.raw.expr
exprZ.esc_rna <- esc.exprZ
kinship_loco.esc_rna <- esc.kinship_loco
probs.esc_rna <- esc.probs
covar.esc_rna <- esc.covar
covarTidy.esc_rna <- covarTidy
exprComBat.esc_rna <- esc.expr.ComBat
expr.esc_rna <- expm1(exprComBat.esc_rna) # re-transforming since the data was log(x+1) before combat
expr.esc_rna[expr.esc_rna < 0] <- 0
expr.esc_rna <- t(expr.esc_rna)
rm(esc.expr, esc.exprZ, esc.kinship_loco, esc.probs, esc.expr.ComBat, esc.raw.expr, covarTidy, exprComBat.esc_rna, esc.covar, esc.covarTidy)
##### ATAC data
# load the atac peaks
load(here("../pQTL_website/_data/DO_atacseq_prelim_DS0420.RData")) # counts.norm
# remove bad samples
counts.norm <- counts.norm[, !colnames(counts.norm) %in% c("PB357.01", "PB357.17")]
covar.esc_atac <- covar.mat[!rownames(covar.mat) %in% c("PB357.01", "PB357.17"), ]
covarTidy.esc_atac <- covar_full[!covar_full$PB_ID %in% c("PB357.01", "PB357.17"), ]
probs.esc_atac <- probs
# note that there are 2 plates of ATAC samples that have been processed. I will use comBat to remove this effect.
dat <- log1p((counts.norm))
mod <- model.matrix(~sex, data = covarTidy.esc_atac)
exprComBat <- sva::ComBat(
  dat = dat, batch = covarTidy.esc_atac$plate, mod = mod,
  par.prior = TRUE, prior.plots = FALSE
)
counts.norm2 <- expm1(exprComBat)
counts.norm2[counts.norm2 < 0] <- 0
covar.esc_atac <- covar.esc_atac[, "sex", drop = FALSE]
counts.normZ <- apply(counts.norm2, 1, rankZ)
kinship_loco.esc_atac <- calc_kinship(probs.esc_atac, type = "loco")
rm(probs, covar_full, covar.mat, map_dat, dat, mod, exprComBat, counts.norm)
### Annotating using ChipSeeker + TxDb.Mmusculus.UCSC.mm10.knownGene (this uses UCSC data from Oct, 2019 which I assume matches to ensembl v98 (Sep19))
# annotate Atac data
# prep & annotate data
atac.counts <- counts.norm2 %>%
  as.data.frame() %>%
  rownames_to_column() %>%
  separate(rowname, into = c("Chr", "Start", "End"), sep = "_", remove = FALSE) %>%
  mutate(Chr = gsub("peak", "", Chr)) %>%
  column_to_rownames()
gr <- makeGRangesFromDataFrame(atac.counts, 
                               keep.extra.columns = T, 
                               seqnames.field = c("Chr"), 
                               start.field = "Start", 
                               end.field = "End")
gr <- diffloop::addchr(gr)
peakAnno <- annotatePeak(gr,
  TxDb = TxDb.Mmusculus.UCSC.mm10.knownGene, annoDb = "org.Mm.eg.db",
  tssRegion = c(-5000, 5000),
  level = "transcript", assignGenomicAnnotation = TRUE,
  genomicAnnotationPriority = c("Promoter", "5UTR", "3UTR", "Exon", "Intron", "Downstream", "Intergenic"),
  addFlankGeneInfo = FALSE,
  overlap = "TSS", verbose = FALSE
)
atac.peak.annots_full <- as_tibble(peakAnno) %>%
  select(!starts_with("PB")) %>%
  mutate(
    seqnames = gsub("chr", "peak", seqnames),
    chr = gsub("peak", "", seqnames)
  ) %>%
  unite("peak_id", "seqnames", "start", "end", remove = FALSE) %>%
  select(-seqnames)
```

```{r id_matching, message=FALSE, warning=FALSE, echo =FALSE}
# I am using v98 annotations since protein id's are v98
# Saved all gene annotations from biomart using Sept2019 archive http://sep2019.archive.ensembl.org/biomart
all_annot_v98 <- read_tsv( here("../pQTL_website/_data","ensembl_gene_annotations_v98.txt")) %>% 
rename( "ensembl_gene_id" = "Gene stable ID",
          "protein_id" = "Protein stable ID",
          "gene_start" = "Gene start (bp)",
          "gene_end" = "Gene end (bp)",
          "gene_chr" = "Chromosome/scaffold name",
          "mgi_symbol" = "MGI symbol",
          "gene_biotype" = "Gene type"
        ) %>% 
  group_by(ensembl_gene_id) %>% 
  mutate( n_gene_id = n()) %>% # there are cases where gene id is repeated with a valid protein id + NA
  ungroup() %>% 
  filter( !(n_gene_id >1 & is.na(protein_id) & gene_biotype =="protein_coding") ) %>% 
  select(-n_gene_id)
# make all.genes, all.prots, atac.peak.annots
all.genes <- all_annot_v98 %>% 
  filter( ensembl_gene_id %in% colnames(expr.esc_rna)) %>% 
  select( -protein_id) %>% 
  distinct() 
# all genes:
# 14405/14547 has annotations in v98, 141 missing annotations
# closer look at some show that the ensembl gene ids have been changed. Not sure what the best way in updating those are. 
# Ex: ENSMUSG00000029333 matches Rasgef1b in MGI which has ENSMUSG00000089809 as the new ensembl gene id in v98. I can't find the connection other than manually searching for it! 
  
  
all.prots <- all_annot_v98 %>%
  filter( protein_id %in% colnames(expr.esc_prot)) %>% 
  filter( !is.na(mgi_symbol) ) # I am removing the ones without an annotation 
# 7432/7432 has annotations!
all_atac_peak_gene_annots <- all_annot_v98 %>%  
  filter( ensembl_gene_id %in% atac.peak.annots_full$ENSEMBL) %>% 
  select( -protein_id) %>% 
  distinct() %>% 
  filter( !is.na(mgi_symbol) | !is.na(ensembl_gene_id)) # I am removing the ones without an annotation (n = 3)
# re-writing the annotations. 
atac.peak.annots_full <- atac.peak.annots_full %>% 
  select(peak_id, 
         peak_start = start, 
         peak_end = end, 
         peak_width = width, 
         peak_strand = strand, 
         annotation, 
         distanceToTSS, 
         "ensembl_gene_id"="ENSEMBL") %>% 
  left_join( all_atac_peak_gene_annots)
atac.peak.annots <- atac.peak.annots_full %>% 
  filter( !is.na(mgi_symbol))
# 100430/102173 has annotations
# df with all ensembl gene ids /protein ids found in the three data sets. 
all_omics_ids <- full_join( all_atac_peak_gene_annots, all.genes) %>% 
  full_join( all.prots) #%>% 
  #left_join( select(atac.peak.annots, ensembl_gene_id, peak_id)) # I can add peak_ids if I want to! 
# saving for later use
# save( all.genes, all.prots, atac.peak.annots, file = here("_data","updated_annotations_04042022.RData"))
```

```{r load_qtl_data, warning=FALSE, message=FALSE, results="hide" , echo =FALSE}

######### merged caQTL, eQTL, pQTL
# see "_src/" for details on how they are merged
load(here("../pQTL_website/_data/peaks_comparison_10Mb_noPoly_v4_w_atac.RData")) # peaks.esc.overlap2, peaks.esc.prot.rna, peaks.esc.rna.atac

######### pQTL data with effects
load(here("../pQTL_website/_data/DO_mESC_pQTL_effects_noPoly_v2.RData"))
peaks.esc_prot <- peaks # all pQTL peaks
effects_blup.esc_prot <- effects_blup # all pQTL effects using blup
rm(peaks, effects_blup, effects_std)
# adding effects to pQTL peaks
peaks.esc_prot.blup <- cbind(peaks.esc_prot, effects_blup.esc_prot) %>% 
  dplyr::rename("protein_id" = "phenotype")
colnames(peaks.esc_prot.blup) <- c(
  colnames(peaks.esc_prot.blup)[1:2],
  paste0(colnames(peaks.esc_prot.blup)[3:dim(peaks.esc_prot.blup)[2]], ".esc_prot")
)
# adding annotations to pQTL with effects
peaks.esc_prot.wEffs <- peaks.esc_prot.blup %>% 
  left_join( all.prots) %>% 
  mutate(midpoint = (gene_start + gene_end) / 2) %>% 
  mutate( same_chrom =  (peak_chr == gene_chr),
          diff = abs(midpoint - interp_bp_peak.esc_prot)) %>% 
  mutate( local.esc_prot = ifelse( same_chrom & 
                            diff < 10e06, TRUE, FALSE
    )) 
# adding RNA + protein details to the effects data frame for all eQTL & pQTL peaks.
peaks.esc_prot.blup <- peaks.esc_prot.blup %>% 
  left_join(., mutate(peaks.esc.prot.rna,
                      peak_cM.esc_prot = as.numeric(peak_cM.esc_prot),
                      peak_cM.esc_rna = as.numeric(peak_cM.esc_rna)
))
######### eQTL data with effects
load(here("../pQTL_website/_data/ESC_eQTL_effects.RData"))
peaks.esc_rna <- peaks
effects_blup.esc_rna <- effects_blup
effects_std.esc_rna <- effects_std
rm(peaks, effects_blup, effects_std)
# Adding effects to eQTL peaks
peaks.esc_rna.blup <- cbind(peaks.esc_rna, effects_blup.esc_rna) %>%
  dplyr::rename("ensembl_gene_id" = "phenotype")
colnames(peaks.esc_rna.blup) <- c(
  colnames(peaks.esc_rna.blup)[1:2],
  paste0(colnames(peaks.esc_rna.blup)[3:dim(peaks.esc_rna.blup)[2]], ".esc_rna")
)
# adding annotations to eQTL peaks with effects
peaks.esc_rna.wEffs <- peaks.esc_rna.blup %>% 
  left_join( all.genes) %>% 
  mutate(midpoint = (gene_start + gene_end) / 2) %>% 
  mutate( same_chrom =  (peak_chr == gene_chr),
          diff = abs(midpoint - interp_bp_peak.esc_rna)) %>% 
  mutate( local.esc_rna = ifelse( same_chrom & 
                            diff < 10e06, TRUE, FALSE
    ))
# adding RNA + protein details to the effects data frame for all eQTL & pQTL peaks.
peaks.esc_rna.blup <- peaks.esc_rna.blup %>% # 
  left_join(., mutate(peaks.esc.prot.rna,
  peak_cM.esc_rna = as.numeric(peak_cM.esc_rna),
  peak_cM.esc_prot = as.numeric(peak_cM.esc_prot)
))
######### caQTL data
load(here("../pQTL_website/_data/atac_effects_at_qtl_peaks.RData")) # peaks
atac.eff_blup <- cbind(peaks, effects_blup)
peaks.esc_atac <- peaks
rm(peaks, effects_blup, effects_std)
# renaming effects
atac.eff_blup %>% 
  select( peak_id = phenotype, peak_chr, lod.esc_atac = lod, interp_bp_peak.esc_atac = interp_bp_peak,
          A.esc_atac = A, 
          B.esc_atac = B,
          C.esc_atac = C,
          D.esc_atac = D, 
          E.esc_atac = E, 
          F.esc_atac = `F`, 
          G.esc_atac = G,
          H.esc_atac = H
          ) -> peaks.esc_atac.blup
# add local distant to peaks.esc_atac.blup
peaks.esc_atac.blup %>% 
  left_join( ., atac.peak.annots_full %>% 
                    mutate(midpoint = (peak_start+peak_end)/2,
                           atac_chr = gsub("_","",substr(peak_id,5,6))) %>% 
                    select( peak_id, midpoint, atac_chr)
             ) %>%  # adding location information for each peak
  mutate( 
    local.esc_atac = (peak_chr == atac_chr & abs( midpoint - as.numeric(interp_bp_peak.esc_atac)) < 10e06 ),
    ) -> peaks.esc_atac.blup
# fixing some probs -- for association mapping 
attributes(probs.esc_rna)$is_x_chr <-  attributes(probs.esc_atac)$is_x_chr
attributes(probs.esc_prot)$is_x_chr <-  attributes(probs.esc_atac)$is_x_chr
#### merge effects
peaks.merged.blup <- full_join( (peaks.esc_rna.blup %>%  select(-bp_diff, -same_chrom) ),
                                 (peaks.esc_prot.blup %>% select(-bp_diff, -same_chrom)) 
                                )
peaks.esc.overlap.wEffs <- peaks.esc.overlap2 %>% 
  mutate( 
    peak_cM.esc_rna = as.numeric(peak_cM.esc_rna),
    peak_cM.esc_prot = as.numeric(peak_cM.esc_prot),
    before.esc_atac = before, 
    after.esc_atac = after
    ) %>% 
  left_join( peaks.esc_atac.blup %>%  select(-midpoint)) %>% 
  left_join( (peaks.esc_rna.wEffs %>%  select( ensembl_gene_id, peak_chr, lod.esc_rna , before.esc_rna, after.esc_rna, paste0(LETTERS[1:8],".esc_rna"))) ) %>% 
  left_join( (peaks.esc_prot.wEffs %>% select( protein_id, peak_chr, lod.esc_prot , before.esc_prot, after.esc_prot, paste0(LETTERS[1:8],".esc_prot"))) )
```

```{r shared_data, warning=FALSE, message=FALSE, echo =FALSE}
#### Shared data
# get the set of shared genes btw protein + rna
shared.genes <- inner_join( all.genes, all.prots)
# shared in all three data sets
threeway.shared.genes <- atac.peak.annots %>% 
  filter( ensembl_gene_id %in% shared.genes$ensembl_gene_id) %>%  # get genes found in all three omics
  left_join( all.prots) %>% 
  group_by(ensembl_gene_id) %>%
  mutate(new_symbol=paste0(mgi_symbol,"_",1:n()), new_gene_id=paste0(ensembl_gene_id,"_",1:n())) %>% 
  ungroup()
# get the set of shared samples
shared.samples <- intersect(
  rownames(expr.esc_rna)[!grepl("repB", rownames(expr.esc_rna))],
  rownames(expr.esc_prot)[!grepl("repB", rownames(expr.esc_prot))]
)
shared.probs.esc_prot <- probs.esc_prot[ind = shared.samples]
shared.probs.esc_rna <- probs.esc_rna[ind = shared.samples]
# load sample matching table for atac
# load matching sample names for atac-seq data
all.ids <- read_tsv(here("../pQTL_website/_data/Corrected_PBID_SampleMatch_Key_v4.tsv")) %>%
  mutate(sampleid.esc_prot = gsub("rep1", "repA", sampleid.esc_prot))
correct.atac.ids <- read_tsv(here("../pQTL_website/_data/ATAC_sampleids_corrected.tsv"))
sample.matches <- correct.atac.ids %>%
  mutate(sampleid = ifelse(endsWith(correct_id, "B"), paste0(correct_id, "_repB"), paste0(correct_id, "_repA"))) %>%
  mutate(sampleid = ifelse(endsWith(correct_id, "C"), paste0(correct_id, "_repC"), sampleid)) %>%
  full_join(., tibble(rna_id = rownames(expr.esc_rna)), by = c("sampleid" = "rna_id")) %>%
  full_join(., tibble(prot_id = rownames(expr.esc_prot)), by = c("sampleid" = "prot_id"))
sample.matches.nodups <- filter(sample.matches, !endsWith(sampleid, "B"), !endsWith(sampleid, "C"))
# there are some rna/protein samples that are not found in the atac data
# I am adding the top_muga ids for those
missing.samples <- filter(
  all.ids,
  (sampleid.esc_rna %in% filter(sample.matches.nodups, is.na(top_muga))$sampleid & mixup.esc_rna == FALSE) |
    (sampleid.esc_prot %in% filter(sample.matches.nodups, is.na(top_muga))$sampleid & mixup.esc_prot == FALSE)
) %>%
  select(top_muga, sampleid.esc_rna, sampleid.esc_prot, PBID) %>%
  mutate(sampleid = ifelse(is.na(sampleid.esc_rna), sampleid.esc_prot, sampleid.esc_rna)) %>%
  select(-sampleid.esc_rna, -sampleid.esc_prot) %>%
  distinct()
sample.matches.all <- sample.matches.nodups %>%
  full_join(., missing.samples, by = c("sampleid")) %>%
  mutate(
    top_muga = ifelse(!is.na(top_muga.x), top_muga.x, top_muga.y),
    PBID = ifelse(!is.na(PBID.x), PBID.x, PBID.y)
  ) %>%
  select(sampleid, correct_id, ATAC, top_muga, PBID) %>%
  filter(!is.na(top_muga)) # remove the ones that are still NA +  add them back below
# there are still 2 samples that are mismatches with missing top_muga ids
# PB359.33_repA and PB361.72_repA, I am changing those manually using all.ids
mixup.samples <- tibble(
  sampleid = c("PB359.33_repA", "PB361.72_repA"),
  top_muga = c("A903DI1B.C03", "A903DI12.E09"),
  PBID = c("PB359.33", "PB361.72"),
  ATAC = c(NA, NA), correct_id = c(NA, NA)
)
sample.matches.all <- sample.matches.all %>%
  rbind(., mixup.samples)
# For mapping later
# I have factors for all 200 lines + can map with all
# I need to get the genoprobs for all!
# load all 3 and subset + merge
atac.samples <- sample.matches.all %>% 
  filter(ATAC %in% colnames(counts.norm2))
other.samples <- sample.matches.all %>% filter(
  sampleid %in% rownames(expr.esc_rna) |
    sampleid %in% rownames(expr.esc_prot),
  !ATAC %in% colnames(counts.norm2)
)
sample.matches.rna <- sample.matches.all %>%
  filter(sampleid %in% rownames(expr.esc_rna), !is.na(ATAC))
sample.matches.prot <- sample.matches.all %>%
  filter(sampleid %in% rownames(expr.esc_prot), !is.na(ATAC))
# Sharing and setting names
sub.probs.esc_prot <- probs.esc_prot[ind = other.samples$sampleid]
new.ids <- other.samples$top_muga
names(new.ids) <- other.samples$sampleid
sub.probs.esc_prot <- replace_ids(sub.probs.esc_prot, ids = new.ids)
sub.probs.esc_atac <- probs.esc_atac[ind = atac.samples$ATAC]
new.ids2 <- atac.samples$top_muga
names(new.ids2) <- atac.samples$ATAC
sub.probs.esc_atac <- replace_ids(sub.probs.esc_atac, ids = new.ids2)
attributes(sub.probs.esc_atac)$crosstype <- "DO"
attributes(sub.probs.esc_atac)$is_x_chr <- attributes(sub.probs.esc_prot)$is_x_chr
merged.probs <- rbind(sub.probs.esc_atac, sub.probs.esc_prot)
kinship <- calc_kinship(merged.probs, type = "loco")
other.covar <- covar.esc_prot %>%
  as.data.frame() %>%
  rownames_to_column() %>%
  filter(rowname %in% other.samples$sampleid) %>%
  left_join(other.samples, by = c("rowname" = "sampleid")) %>%
  column_to_rownames("top_muga") %>%
  select(sex) %>%
  as.matrix()
atac.covar <- covarTidy.esc_atac[covarTidy.esc_atac$PB_ID %in% atac.samples$ATAC, c("PB_ID", "sex")] %>%
  left_join(atac.samples, by = c("PB_ID" = "ATAC")) %>%
  mutate(sex = ifelse(sex == "F", 0, 1)) %>%
  column_to_rownames("top_muga") %>%
  select(sex) %>%
  as.matrix()
merged.covar <- rbind(atac.covar, other.covar)
threeway.shared.samples <- filter(sample.matches, 
                                  sampleid %in% shared.samples, 
                                  ATAC %in% colnames(atac.counts)) %>%
  left_join(., sample.matches.all)
threeway.shared.probs <- merged.probs[ind = threeway.shared.samples$top_muga]
shared.atac.data <- t(counts.norm2[threeway.shared.genes$peak_id, threeway.shared.samples$ATAC])
colnames(shared.atac.data) <- threeway.shared.genes$new_symbol
rownames(shared.atac.data) <- threeway.shared.samples$sampleid
shared.rna.data <- (expr.esc_rna[threeway.shared.samples$sampleid, threeway.shared.genes$ensembl_gene_id])
colnames(shared.rna.data) <- threeway.shared.genes$new_symbol
shared.prot.data <- (expr.esc_prot[threeway.shared.samples$sampleid, threeway.shared.genes$protein_id])
colnames(shared.prot.data) <- threeway.shared.genes$new_symbol
# get shared qtl peaks
peaks.shared.genes <- peaks.esc.overlap2 %>%
  filter(ensembl_gene_id %in% shared.genes$ensembl_gene_id |
   protein_id %in% shared.genes$protein_id) %>%
  mutate(
    lod.esc_rna = ifelse(is.na(lod.esc_rna), 0, lod.esc_rna),
    lod.esc_prot = ifelse(is.na(lod.esc_prot), 0, lod.esc_prot)
  ) %>%
  mutate(peak_cM.esc_prot = as.numeric(peak_cM.esc_prot), 
         peak_cM.esc_rna = as.numeric(peak_cM.esc_rna))
```

```{r load_gene_sets, warning=FALSE, message=FALSE, results='hide', echo =FALSE}
# Complex members retreieved from Ori et al 2016, additional file2:  https://variablecomplexes.embl.de/html-tables/Additional_file_2.html
complexes <- read_xlsx(path = here("../pQTL_website/_data", "protein_complexes.xlsx")) # Assuming they are using 2016 human ensembl id's. but not worrying about it atm.
complex.genes <- (str_split(complexes$`Member identifiers (human Ensembl gene)`, " "))
names(complex.genes) <- complexes$`Complex Name`
# # running the code below and saving to RData file to re-use
# library(biomaRt)
# mart_human <- useMart("ensembl", dataset = "hsapiens_gene_ensembl", host = "http://sep2019.archive.ensembl.org")
# mart_mouse <- useMart("ensembl", dataset = "mmusculus_gene_ensembl", host = "http://sep2019.archive.ensembl.org")
# matched.ids <- getLDS(
#   attributes = c("ensembl_gene_id"),
#   filters = "ensembl_gene_id",
#   values = unique(unlist(complex.genes)),
#   mart = mart_human,
#   attributesL = c("ensembl_gene_id"),
#   martL = mart_mouse
# )
# complex.gene.list <- matched.ids %>%
#   rename( human_ids = `Gene.stable.ID`,
#           ensembl_gene_id = `Gene.stable.ID.1`) %>%
#   left_join(
#     full_join( all.genes, all.prots)
#     ) %>%  # adding all annotations
#   filter( ensembl_gene_id %in% all.genes$ensembl_gene_id |
#             protein_id %in% all.prots$protein_id) # filtering for the genes + proteins in our dataset
# # note that 20 human ids don't have 1-1 mapping, they match 10 genes. Mainly because that complex subunit is not found in mice.
# # complex.gene.list %>%  group_by(ensembl_gene_id, protein_id) %>%  mutate( n =n()) %>% filter( n>1) %>% view()
# 
# save(complex.gene.list, file = here("_data","complex_gene_list.RData"))
load(here("../pQTL_website/_data/complex_gene_list.RData"))
# getting TF list
all.tfs <- readxl::read_xlsx(path = here("../pQTL_website/_data/Mouse_TFs_TcoFDB.xlsx")) %>%
  select(Symbol) %>%
  rename(mgi_symbol = Symbol)

  
# esc gene lists
# load lists
# get gene lists for heatmaps
# From Xu et al 2014 paper
# I fed the list to MGI and only Tcf3 has an old symbol Tcf7l1
xu.genes <- read_xlsx(here("../pQTL_website/_data/GeneList-EScell-PluMaint.xlsx"), sheet = 1) %>%
   mutate( Gene = ifelse( Gene =="Tcf3", "Tcf7l1",Gene)) %>% # updating some names to match our mgi_symbols
  left_join(., all_omics_ids, by = c("Gene" = "mgi_symbol")) # Label = "pluripotency", "mesoderm" ...etc.
# From Kalkan et al 2017 paper
# I fed the list to MGI and only Tcf3 has an old symbol Tcf7l1
kalk.genes <- read_xlsx(here("../pQTL_website/_data/GeneList-EScell-Kalkan2017.xlsx"), sheet = 1) %>%
  mutate( Gene = ifelse( Gene =="Tcf3", "Tcf7l1",Gene)) %>% # updating some names to match our mgi_symbols
  left_join(., all_omics_ids, by = c("Gene" = "mgi_symbol")) # Label = " naive_pluripotency" ... etc.
# These are :Potential pluripotency associated genes from multiple genome-wide RNAi screens studies.
# downloaded from: http://www.maayanlab.net/ESCAPE/download/RNAihits.txt.zip
# I fed the list to MGI and many genes have an old symbol, I added those to the file as well - RNAhits_updated.xlsx
# Here merging that to the old list to include both old + new symbols in case we have the older ones in our annotations
escape.genes <- read_tsv(here("../pQTL_website/_data/RNAihits.txt")) %>% # these have capital letter gene names
  select(geneName) %>%
  rbind( read_xlsx(here("../pQTL_website/_data/RNAihits_updated.xlsx"), sheet = 1) %>% 
           select(geneName = Symbol) %>% 
           filter(!is.na(geneName))
  ) %>% 
  distinct() %>% 
  mutate(mgi_symbol = paste0((substring(geneName, 1, 1)), tolower(substring(geneName, 2)))) %>%
  select(-geneName) %>%
  left_join( all_omics_ids) 
# These are: Lists of ESC and diffferentiating ESC-specific proteins. We collected 19,801 ESC and diffferentiating ESC-specific proteins from proteomics. 
# downloaded from: http://www.maayanlab.net/ESCAPE/download/proteomicsESC.txt.zip
# url <- "http://www.maayanlab.net/ESCAPE/download/proteomicsESC.txt.zip"
# download.file(url, here("_data/proteomicsESC.txt.zip"))
# zip::unzip(here("_data/proteomicsESC.txt.zip"), overwrite = F, exdir = "_data/")
# I uploaded the mouse subsets to bimart + converted them manually
escape_diff_genes <- read_tsv( "../pQTL_website/_data/escape_diff_genes.tsv") %>% 
  rename( ensembl_gene_id = `Gene stable ID`, mgi_symbol = `MGI symbol`,geneID = `NCBI gene (formerly Entrezgene) ID`) %>% 
  left_join( ., read_tsv( here("../pQTL_website/_data/proteomicsESC.txt") ) , by = c("geneID" ) ) %>% 
  left_join( all_omics_ids) 
  
dan.genes <- data.frame(
  mgi_symbol = c(
    "Esrrb", "Klf4", "Nanog", "Pou5f1", "Sox2",
    "Chrd", "Fgf8", "Otx2", "Pax6", "Sox1",
    "Gata4","Gata6", "Pdgfra", "Sox17", "Sox7", 
    "Bmp2","Kdr", "Mixl1", "T", "Wnt3a"
  ),
  type = c(rep("pluripotency", 5), rep("ectoderm", 5), rep("endoderm", 5), rep("mesoderm", 5))
) %>%
  left_join(., all_omics_ids)
ortmann_genes <- tibble(
  type = c( rep("primitive endoderm",4),
                  "neurectoderm", "definitive endoderm", "ectoderm","epiblast",
                  rep("Wnt-related genes",8),
            rep("pluripotency",8)),
  mgi_symbol = c( "Gata6","Pdgfra","Sox7","Sparc1",
                  "Sox1","Sox17","Otx2","Fgf5", 
            "Axin2","Lef1","Csnk1e", "Gnai1", "Nkd1", "Tcf7", "Wwnt5b", "Fgfrl1",
            "Pou5f1","Nanog","Klf2","Klf4","Dppa5","Esrrb","Tfcp2l1","Sox2")
)
all.esc.genes <- tibble(mgi_symbol = 
                          unique(c(escape.genes$mgi_symbol, 
                                   xu.genes$Gene,
                                   kalk.genes$Gene,
                                   "Etv5","Rbpj","Fgf15","Dusp9","Id1","Id3","Id4")) # adding some from other papers
                        ) %>%
  left_join(all_omics_ids) %>%
  mutate(s_Mbp = (gene_start / 1e06) - 0.002, e_Mbp = (gene_end / 1e06) + 0.002)
# downloading the ccvariants
# download.file("https://ndownloader.figshare.com/files/9746485", "_data/cc_variants.sqlite")
# read in variant definitions
var.types <- read_xlsx(path = here("../pQTL_website/_data/Variant_Definitions.xlsx")) %>%
  rename(type = `SO term`, score = `Functional consequence score`)

bulut.genes <- readxl::read_xlsx(path = here("../pQTL_website/_data/Bulut_2018_TableS6.xlsx")) %>%
  rename(mgi_symbol = OFFICIAL_GENE_SYMBOL) %>%
  select(-Name, -Species) %>%
  rowwise() %>%
  mutate(mgi_symbol = paste0(substr(mgi_symbol, 1, 1), tolower(substr(mgi_symbol, 2, nchar(mgi_symbol))))) %>%
  rbind(tibble(mgi_symbol = c("Tip60", "Ep400", "Kat5", "Kat8", "Ino80", "Ash2l", "Chd1"))) %>% # these are mentioned by name in the papers
  left_join(all_omics_ids)
# 2c-like cell genes from Hendrickson 2017, table S8
# get the ensembl gene ids, they are from december 2011 acc to the table details
# find most recent versions of the ids using ensembl id converter
# Requested ID	Matched ID(s)
genes_2c <- read_xlsx( path = here("../pQTL_website/_data","Hendrickson_2017_tableS8_2clike_gene_list.xlsx"), sheet = 2) %>% 
  select( ensembl_gene_id_dec2011 = `Requested ID`,	
          ensembl_gene_id = `Matched ID(s)`) %>% 
  left_join( all_annot_v98) %>%
  left_join( all_omics_ids) %>% 
  filter( !is.na(mgi_symbol)) 
```

```{r load_mediations, warning=FALSE, message=FALSE, echo =FALSE}
# load mediation results
load(here("../pQTL_website/_data/DO_mESC_eQTL_protein_mediation_lod6_v3_merged.RData")) # eqtl_rna_meds
load(here("../pQTL_website/_data/DO_mESC_eQTL_RNA_mediation_lod6_v3_merged.RData")) # eqtl_prot_meds
load(here("../pQTL_website/_data/DO_mESC_pQTL_rna_mediation_lod6_noPoly_v4_merged.RData")) # pqtl_rna_meds
load(here("../pQTL_website/_data/DO_mESC_pQTL_protein_mediation_lod6_noPoly_v4.RData")) # results
pqtl_prot_meds <- results
load(here("../pQTL_website/_data/DO_mESC_caQTL_rna_mediation_merged_v2.RData"))
load(here("../pQTL_website/_data/DO_mESC_caQTL_prot_mediation_merged_v2.RData"))
```

```{r misc, warning=FALSE, message=FALSE, results="hide" , echo =FALSE}
## lifr genotypes
# get_LIFR_genotypes
# using Dan's code to get LIFR genotypes for the full list of animals
probs <- merged.probs
markers <- tibble(name = dimnames(probs[[15]])[[3]]) %>%
  mutate(name2 = name) %>%
  separate(name2, into = c("chrom", "pos"), sep = "_", convert = TRUE)
# LIFr SNP is chr15:7116944 (rs50454566)
mm <- filter(markers, chrom == "15", pos > 7090000, pos < 7130000) # 3 markers
probs2 <- probs$`15`[, , mm$name]
closest_geno <- function(p, tol = 0.01) {
  if (sum(abs(p - c(1, 0))) < tol) {
    return("A")
  }
  if (sum(abs(p - c(0, 1))) < tol) {
    return("B")
  }
  if (sum(abs(p - c(0.5, 0.5))) < tol) {
    return("H")
  }
  return(NA)
}
call_geno <- function(mat) {
  # mat is nsamp*8 (haps)
  # A = A_J
  # B = B6
  # C = 129
  # D = NOD
  # E = NZO
  # F = CAST
  # G = PWK
  # H = WSB
  # I want to divide NOD + CAST + PWK + WSB
  # vs. the other four
  grp <- c("A", "A", "A", "B", "A", "B", "B", "B")
  collapsed <- apply(mat, 1, function(x) tapply(x, grp, sum))
  apply(collapsed, 2, closest_geno)
}
probs3 <- apply(probs2, 3, call_geno)
# assert_that(noNA(probs3))
one <- probs3[, 1] # marker left of Lifr
two <- probs3[, 2] # closest marker to Lifr
three <- probs3[, 3] # marker right of Lifr
# "PB360.49" has an ancestry switch between markers 1 & 2!
# Get samples in group A (inbred strains) and group B (wild-derived + NOD)
inbred <- rownames(probs[[1]])[one == "A" & two == "A" & three == "A"]
wildder <- rownames(probs[[1]])[one == "B" & two == "B" & three == "B"]
hets <- rownames(probs[[1]])[one == "H" & two == "H" & three == "H"]
# cat(inbred, sep="\n", file="lifr_genotype_inbred.txt")
# cat(wildder, sep="\n", file="lifr_genotype_wildder.txt")
# cat(hets, sep="\n", file="lifr_genotype_het.txt")
#
data_frame(
  lifr_geno = factor(c(rep("Ref", length(inbred)), c(rep("Alt", length(wildder))), c(rep("Het", length(hets))))),
  lifr = factor(c(rep(0, length(inbred)), c(rep(1, length(wildder))), c(rep(2, length(hets))))),
  rowname = c((inbred), (wildder), (hets))
) %>%
  mutate(rowname = ifelse(is.na(rowname), "A903DI1C.C12", rowname)) -> covar.lifr

merged.covar %>%
  as.data.frame() %>%
  rownames_to_column() %>%
  full_join(covar.lifr) %>%
  column_to_rownames("rowname") -> merged.covar2
merged.covar3 <- model.matrix(~ sex + lifr, merged.covar2)[, -1]
shared.probs <- merged.probs[ind = threeway.shared.samples$top_muga]
attributes(shared.probs)$is_x_chr <-  attributes(probs.esc_atac)$is_x_chr
shared.covar <- merged.covar[ threeway.shared.samples$top_muga,,drop=FALSE]
shared.kinship <- calc_kinship(shared.probs, type = "loco")

# prep some stuff for plotting:
uchr <- c(as.character(1:19), "X")
cl <- dplyr::select(map_dat2, chr, pos_bp) %>%
  group_by(chr) %>%
  dplyr::summarize(len = max(pos_bp))
clp <- with(cl, setNames(len, chr))
chrom_lens <- setNames(as.numeric(clp[uchr]), uchr)
chrom_lens_offset <- cumsum(chrom_lens) - chrom_lens
chrom_lens_midpt <- chrom_lens_offset + chrom_lens / 2
all.genes2 <- all.genes %>% mutate(midpoint = (gene_start + gene_end) / 2)
all.genes2$cumsum_bp_gene <- all.genes2$midpoint + chrom_lens_offset[all.genes2$gene_chr]
all.prots2 <- all.prots %>% mutate(midpoint = (gene_start + gene_end) / 2)
all.prots2$cumsum_bp_gene <- all.prots2$midpoint + chrom_lens_offset[all.prots2$gene_chr]
map_dat2 <- rename(map_dat2, pos_cM = pos)
map_dat2 <- mutate(map_dat2, pos_cM = as.numeric(pos_cM))

#  rna / prot / chromatin
qtl.colors <- c( rna = "#228833", 
                 prot = "#4477AA", 
                 atac = "#EE6677",
                 shared = "#AA3377")
#founder_colors <- c("#FFDC00", "#888888", "#F08080", "#0064C9", "#7FDBFF", "#2ECC40", "#FF4136", "#B10DC9")
founder_colors <- c(AJ = "#F0E442", B6 = "#555555", `129` = "#E69F00", NOD = "#0072B2",
   NZO = "#56B4E9", CAST = "#009E73", PWK = "#D55E00", WSB = "#CC79A7")
## adding cc_variants for association mapping
query_variants <- create_variant_query_func(here("../pQTL_website/_data/cc_variants.sqlite"))
query_genes <- create_gene_query_func(here("../pQTL_website/_data/mouse_genes_mgi.sqlite"))
# sex calls from Alex Stanton retrieved via slack on July 28, 2020
sex_calls <- read_csv(here("../pQTL_website/_data/ploidystatus_AS_07282020.csv"))
# for LOLA
regionDB = loadRegionDB(here("_data/LOLA/nm/t1/resources/regions/LOLACore/mm10/"))

```


```{r save_RData_for_figshare, echo=FALSE, eval=FALSE}

# 1.	Proteomics data: DO_mESC_proteomics.RData file contains the following objects.
# •	expr.esc_prot
# •	exprZ.esc_prot
# •	all.prots
# •	probs.esc_prot
# •	covar.esc_prot
# •	covarTidy.esc_prot
# •	kinship_loco.esc_prot
# •	pmap, gmap, map_dat2

covarTidy.esc_prot <- left_join(covarTidy.esc_prot, covar.lifr, by = c("top_muga"="rowname"))

save( file = here("_figshare","DO_mESC_proteomics.RData"),
      expr.esc_prot,
      exprZ.esc_prot, 
      all.prots, 
      shared.genes, 
      shared.samples,
      probs.esc_prot,
      covar.esc_prot,
      covarTidy.esc_prot,
      kinship_loco.esc_prot,
      pmap,
      gmap,
      map_dat2)

# 2.	Complexes: The list of protein complexes and the genes within them as retrieved from Ori et al., 20161.
save( complexes,
      file = here("_figshare","Complex_gene_list.RData")
)

# 3.	Data used in covariation analysis: DO_mESC_covar.RData file contains the following objects:
# •	expr.esc_rna
# •	all.genes
# •	all.tfs
# •	counts.norm2
# •	threeway.shared.samples
# •	threeway.shared.genes
# •	atac.peak.annots

save(
  expr.esc_rna,
  all.genes, 
  all.tfs,
  counts.norm2,
  threeway.shared.genes,
  threeway.shared.samples,
  atac.peak.annots,
  file = here("_figshare","DO_mESC_covar.RData")
)

# 4.	pQTL data: DO_mESC_pQTL.RData file contains the following objects.
# •	esc.prot.scans – pQTL scans for all 7,432 proteins.
# •	esc.rna.scans
# •	pqtl_prot_meds – Mediation results of pQTL peaks using protein abundances.
# •	pqtl_rna_meds – Mediation results of pQTL peaks using transcript abundances.
# •	do_mesc_pqtl_perms – Permutation results with alpha = 0.05 for all 7,432 proteins.
load("/projects/munger-lab/projects/DO_mESC/proteomics/pqtl_mapping_SA/DO195_mESC_pQTL_scans_noPoly_v2.RData") # esc.prot.scans
load("/projects/munger-lab/projects/DO_mESC/rna_seq/qtl_mapping/total_gene_expression/eqtl_grid69k_pe/DO185_mESC_paired_eQTL_scans.RData") # esc.rna.scans
do_mesc_pqtl_prot_meds <- pqtl_prot_meds
do_mesc_pqtl_rna_meds  <- pqtl_rna_meds
# need to load the permutation results too
# load and only keep the alpha = .05 results
load(here("/projects/munger-lab/projects/DO_mESC/proteomics/pqtl_mapping_SA/pQTL_perms_01_005.RData"))
do_mesc_pqtl_perms <- perms %>% 
  filter(alpha == 0.05)
rm(perms)

save(
  esc.prot.scans,
  esc.rna.scans,
  pqtl_prot_meds,
  pqtl_rna_meds,
  do_mesc_pqtl_perms,
  file = here("_figshare","DO_mESC_pQTL.RData")
)

# 5.	Overlap across QTL peaks data: 
# •	peaks.esc_prot
# •	peaks.esc.overlap.wEffs
# •	exprZ.esc_rna, probs.esc_rna, covar.esc_rna, kinship_loco.esc_rna
# •	counts.normZ, probs.esc_atac, covar.esc_atac, kinship_loco.esc_atac
# •	threeway.shared.samples
# •	threeway.shared.genes
save(
  peaks.esc_prot,
  peaks.esc.overlap.wEffs,
  exprZ.esc_rna, probs.esc_rna, covar.esc_rna, kinship_loco.esc_rna,
  counts.normZ, probs.esc_atac, covar.esc_atac, kinship_loco.esc_atac,
  threeway.shared.samples,
  threeway.shared.genes,
  file = here("_figshare","DO_mESC_QTL_peaks.RData")
)

# 6.	Data used in MOFA and integration results: DO_mESC_MOFA.RData
# •	all_df_shared
# •	all_df_shared_model
# •	shared_samples_metadata
# •	shared.probs
# •	shared.kinship
# •	shared.covar

# •	gmap, pmap, map_dat2


library(MOFA2)
# load data from the MOFA project
load("/projects/munger-lab/projects/DO_mESC/proteomics/pQTL_website/_data/MOFA_data_prep_04112022.RData") # has all the data frames
rm(all_df, 
   all_df_top5k, all_df_top5k_shared,
   trans_df_top15k, trans_df_top15k_shared,
   herit_df_shared)

# load MOFA model + process it
# create the MOFA object
all_df_shared_MOFAobject <- create_mofa(all_df_shared)

# Load in model results generated using _src/MOFA_train_model_CPU.r
all_df_shared_model <- load_model(here("../pQTL_website/_data/all_df_shared_30factors_2022-04-11.hdf5"), remove_inactive_factors = F)
# Warning message:
# In .quality_control(object, verbose = verbose) :
#   Factor(s) 3 are strongly correlated with the total number of expressed features for at least one of your omics. Such factors appear when there are differences in the total 'levels' between your samples, *sometimes* because of poor normalisation in the preprocessing steps.

# Removing Factor 3 that shows high correlation to the total # of expressed features.
all_df_shared_model <- subset_factors( all_df_shared_model, factors = c(1:2, 4:30))

# Removing Factors that don't explain at least 1% of variation in a data set. 
all_df_shared_factor_var <- all_df_shared_model@cache$variance_explained$r2_per_factor %>%
  as.data.frame() %>%
  rownames_to_column() %>%
  filter(single_group.Chromatin > 1 |
           single_group.Protein > 1 |
           single_group.Transcript > 1) %>%
  column_to_rownames()
all_df_shared_model_filtered <- subset_factors(all_df_shared_model,
                                        factors = as.numeric(gsub("Factor","",rownames(all_df_shared_factor_var))))

# updating the sexes in the metadata to use Female/Male instead of 0s and 1s.
merged_metadata <- merged.covar2 %>%
  as.data.frame() %>%
  rownames_to_column() %>%
  select(-lifr) %>%
  rename("sample"="rowname")
shared_samples_metadata <- all_df_shared_model_filtered@samples_metadata %>%
  left_join(.,merged_metadata) %>%
  mutate(sex = ifelse(sex ==0,"Female", "Male"))
samples_metadata(all_df_shared_model_filtered) <- shared_samples_metadata

# Get Factor weights
all_df_shared_factors <- get_factors(all_df_shared_model_filtered,
  factors = "all",
  as.data.frame = T
)
# Add sample details and convert to matrix with Factors in rows and samples in columns.
all_df_shared_factors_mat <- all_df_shared_factors %>%
  pivot_wider(id_cols = "sample", names_from = "factor", values_from = "value") %>%
  left_join(., select(threeway.shared.samples, top_muga, sampleid), by =c("sample"="top_muga")) %>%
  column_to_rownames("sample") %>%
  select(-sampleid) %>%
  as.matrix()

# Get feature weights
all_df_shared_weights <- get_weights(all_df_shared_model_filtered, as.data.frame = TRUE, scale = TRUE)

# load in permutation results
load(here("../pQTL_website/_data/MOFA_all_df_thres_rankZ.RData"))

# load in combined lola results
load( here("../pQTL_website/_data/","MOFA_Factor_LOLA_results.RData")) # lola_combined

# saving
save(
  all_df_shared,
  all_df_shared_model,
  #all_df_shared_model_filtered,
  shared_samples_metadata,
  #all_df_shared_factors,
  #all_df_shared_weights,
  #all_df_shared_factors_mat,
  shared.probs,
  shared.kinship,
  shared.covar,
  gmap, pmap, map_dat2,
  all_df_shared_thres,
  lola_combined,
  file = here("_figshare","DO_mESC_MOFA.RData")
)

```


<br> <br>

### Figure 1A: Overview of data sets

```{r Figure1A, fig.cap = "Figure 1A: Nearly 200 embryonic stem cell lines were established from blastocysts of Diversity Outbred mice, and quantified using ATAC-seq, RNA-seq (Skelly et al., 2020), and multiplexed mass spectrometry; 163 lines have all three measures."}

knitr::include_graphics(here("Figure1A.png"))

```

<br>

#### Table S1.Quantitative proteomic analysis of DO mESCs.

Table containing normalized and filtered protein abundances (n = 7,432) across individual DO mESClines (n = 190) and DO mESC line details with experimental covariates sex and genotype at the *Lifr* locus.



```{r TableS1_sheet1, echo = FALSE}

# expr.esc_prot %>%
#   as_tibble( rownames = "sampleid") %>%
#   pivot_longer( cols = 2:7433, names_to = "protein_id", values_to = "protein_abundance") %>%
#   left_join( all.prots) %>%
#   select( sampleid, mgi_symbol, protein_id, protein_abundance ) %>%
#   pivot_wider( id_cols = c("protein_id","mgi_symbol"),
#                names_from = "sampleid",
#                values_from = "protein_abundance") %>%
#   #mutate_if( is.numeric, formatC, format = "fg", digits =2 ) %>%
#   rename( `Protein ID` = protein_id,
#           `MGI Symbol` = mgi_symbol) -> protein_abundance
# 
# 
# covarTidy.esc_prot %>%
#   left_join(covar.lifr, by = c("top_muga"="rowname")) %>%
#   select(sampleid, sex, lifr_geno) %>%
#   mutate( sex = ifelse( sex == "M", "Male", "Female")) %>%
#   rename( `Genotype at Lifr` = lifr_geno,
#           `Sample ID` = sampleid,
#           Sex = sex) -> sample_details
# 
# writexl::write_xlsx( list( Protein_abundance = protein_abundance,
#                              Sample_details = sample_details),
#                     path = here("TableS1_DO_mESC_Proteomes.xlsx"),
#                     col_names = TRUE,
#                     format_headers = TRUE
#                     )


# xfun::embed_file(here("Table_S1.xlsx"))

download_file(
  path =  here("Table_S1.xlsx"),
  output_name = "Table_S1",
  button_label = "Download Table_S1.xlsx",
  button_type = "primary",
  has_icon = TRUE,
  icon = "fa fa-save",
  self_contained = FALSE
)

```

<br>

Annotations for all proteins in our data set can be downloaded using the link below.

```{r Protein_annotations, fig.cap = "Annotations for all proteins in our data set."}

list(all.prots) %>% 
  downloadthis::download_this(
    output_name = "Protein annotations",
    output_extension = ".xlsx",
    button_label = "Download protein annotations as xlsx",
    button_type = "primary",
    has_icon = TRUE,
    icon = "fa fa-save"
  )

```

<br> 
<br>

### Figure 1B: Detection bias in proteomics

```{r Figure1B_prep}

# get the list of protein coding genes with transcript abundance in DO mESCs
all.prot_coding.genes <-  all.genes %>% 
  filter( gene_biotype =="protein_coding")

# get average transcript abundance for all protein coding genes
mrna <- expr.esc_rna %>%
  as_tibble( rownames = "sampleid") %>%
  select( all.prot_coding.genes$ensembl_gene_id) %>% 
  summarize_all(.funs = mean, na.rm = T) %>% 
  pivot_longer( all.prot_coding.genes$ensembl_gene_id, 
                names_to = "ensembl_gene_id", 
                values_to = "avg_rna")

# annotate if the gene is found in the protein data by setting prot = TRUE/FALSE
# add average transcript abundance 
all.prot_coding.genes%>% 
  select(ensembl_gene_id, mgi_symbol,ensembl_gene_id) %>%
  mutate(prot = ifelse( (ensembl_gene_id %in% all.prots$ensembl_gene_id |
                           mgi_symbol %in% all.prots$mgi_symbol), TRUE, FALSE)) %>%
  left_join(., mrna) -> prop.data


  
  
# calculate the probability of a protein being present given its mRNA abundance
get_props <- function(dat) {
  prop.table <- c()
  for (i in c(1, 5, 10, 25, 50, 75, 100, 150, 200, 250, 300, 400, 500, 600, 700, 800, 800, 1000, 2000, 5000, 10000, 50000, 1e05)) {
    prop <- dat %>%
      mutate(mRNA = ifelse(avg_rna > i, "at least", "less than")) %>%
      count(mRNA, prot) %>%
      spread(prot, n) %>%
      mutate( `FALSE` = ifelse( is.na(`FALSE`), 0, `FALSE`),
              `TRUE` = ifelse( is.na(`TRUE`), 0, `TRUE`)) %>% # replace NAs in case there aren't any detected/undetected proteins
      filter( (`TRUE`+`FALSE` > 5)) %>% # filter to make sure each bin has at least 5 genes. 
      mutate(detection_probability = `TRUE` / (`FALSE` + `TRUE`))
    prop.temp <- prop %>%
      filter(mRNA == "at least") %>%
      select(detection_probability) %>%
      mutate(avg_rna = i)
    prop.table <- rbind(prop.table, prop.temp)
  }
  return(prop.table)
}

Figure1b_data <- get_props(prop.data) %>% 
  rename( `Probability of protein detection` = detection_probability,
          `Average transcript abundance` = avg_rna)
```


```{r Figure1B_plot, fig.height=4, fig.width=5, fig.cap="Figure 1B: Protein detection rate is linked to transcript abundance. The probability of a gene to have protein abundance measurement given its average transcript abundance among 174 mESCs with both transcriptome and proteome data."}

prob_plot <- Figure1b_data %>% 
  ggplot() +
  aes(x = `Average transcript abundance`, 
      y = `Probability of protein detection`) +
  geom_point(size = 4) +
  scale_x_log10( expand= expansion( mult = c(.1, .12))) +
  theme_pubclean(base_size = 12) +
  geom_line() +
  ylab("Probability of protein detection") +
  xlab("Average transcript abundance") +
  ylim(0.5, .95)+
  theme(
    axis.text = element_text( size = 12),
    axis.title = element_text( size = 12)
  )

ggarrange(prob_plot, 
          labels = "B", 
          font.label = list( size = 20))

```

<br>

```{r Figure1B_data, fig.cap="Data used in Figure 1B."}

Figure1b_data %>% 
  mutate_if( is.numeric, round ,2) %>% 
  create_dt()

```

<br>

#### Table S2. Over-representation analysis of detected and undetected proteins. 

Over-represented biological processes and pathways in proteins detected in all samples, and protein coding genes with transcript abundance lacking protein abundance (undetected proteins) are listed. Details include the data source, term id, term name, term size, the number of intersecting proteins and the FDR for each term identified to be overrepresented in detected or undetected protein coding genes.

```{r TableS2_generate, cache = TRUE, warning=FALSE, message=FALSE}

# get proteins expressed in all samples
expr.esc_prot.comp <- expr.esc_prot %>%
  as.data.frame() %>%
  select( all_of(all.prots$protein_id))  %>% 
  select_if(~ !any(is.na(.)))

# get the gene/protein details for proteins measured in ALL the samples
prots.detected <- all.prots %>% 
  filter(protein_id %in% colnames(expr.esc_prot.comp),
         gene_biotype== "protein_coding")

# over-representation analysis for proteins detected in all samples vs all proteins
g.prot.detected <- gost(
  query = prots.detected$mgi_symbol,
  organism = "mmusculus",
  domain_scope = "custom",
  custom_bg = all.prots$mgi_symbol,
  evcodes = TRUE,
  correction_method = "fdr"
)
g.prot.detected$result <- g.prot.detected$result %>% 
  filter(term_size < 600)

g.prot.detected$result %>% 
  select(
    `Data source` = source,
    `Term ID` = term_id,
    `Term Name` = term_name, 
    `Term size` = term_size, 
    `# of intersecting proteins` = intersection_size,
     FDR = p_value
    ) -> tables2_sheet1

# get proteins not detected although RNA is detected
all.genes %>% 
  filter( !ensembl_gene_id %in% all.prots$ensembl_gene_id & # not in protein data
            gene_biotype == "protein_coding") -> not.detected 

all.prot_coding.genes <-  all.genes %>% 
  filter( gene_biotype =="protein_coding")

g.prot.not.detected <- gost(
  query = not.detected$mgi_symbol,
  organism = "mmusculus",
  domain_scope = "custom",
  custom_bg = all.prot_coding.genes$mgi_symbol, 
  evcodes = TRUE,
  correction_method = "fdr"
)
g.prot.not.detected$result <- g.prot.not.detected$result %>% filter(term_size < 600)

g.prot.not.detected$result %>% 
  select(
    `Data source` = source,
    `Term ID` = term_id,
    `Term Name` = term_name, 
    `Term size` = term_size, 
    `# of intersecting proteins` = intersection_size,
     FDR = p_value
    ) -> tables2_sheet2
  
```

<br>

```{r TableS2_display, echo = FALSE}


# writexl::write_xlsx( list( Detected_protein_coding_genes = tables2_sheet1,
#                              Undetected_protein_coding_genes = tables2_sheet2),
#                     path = here("TableS2_ORA_protein_detection.xlsx"),
#                     col_names = TRUE,
#                     format_headers = TRUE
#                     )


# xfun::embed_file(here("Table_S2.xlsx"))

download_file(
  path = here("Table_S2.xlsx"),
  output_name = "Table_S2",
  button_label = "Download Table_S2.xlsx",
  button_type = "primary",
  has_icon = TRUE,
  icon = "fa fa-save",
  self_contained = FALSE
)

```


<br>

#### Figure S1A-C: Transcription factors show lower transcript and protein abundance than other genes

```{r FigureS1A_C, fig.cap="FigureS1: (A) Genes where protein abundance is detected have a significantly higher mean transcript abundance (One way ANOVA, followed by t-test). Average transcript abundance of protein coding genes (n = 12,732) that are detected (TRUE, n = 7,240) and not detected (FALSE, n = 5,492) in the proteomics data are plotted. (B, C) TFs show a significantly lower mean for both transcript and protein abundance in comparison to other genes (One way ANOVA, followed by t-test). Average transcript and protein abundance of protein coding genes that are transcription factors (TF) and not transcription factors (Not a TF) are plotted.", fig.width=12, fig.height=5}


rna_detect_plot <- apply(na.omit(expr.esc_rna), 2, mean) %>%
  as.data.frame() %>%
  rownames_to_column() %>%
  left_join(., all.genes, by = c("rowname" = "ensembl_gene_id")) %>%
  filter(gene_biotype %in% c("protein_coding")) %>%
  mutate(detected = ifelse(rowname %in% all.prots$ensembl_gene_id, "Detected", "Not detected")) %>%
  ggplot() +
  aes(x = detected, y = `.`) +
  geom_boxplot(width = 0.2) +
  ylab("Average transcript abundance") +
  scale_y_log10( expand = expansion(mult =c(0.4,0.2))) +
  theme_pubclean(base_size = 18) +
  #ggtitle("Protein coding genes") +
  xlab("") +
  stat_compare_means(method = "anova",label.y=7.5)+
  stat_compare_means(method = "t.test",label.y=6.5)
  

tf_rna_plot <- apply(na.omit(expr.esc_rna), 2, mean) %>%
  as.data.frame() %>%
  rownames_to_column() %>%
  left_join(., all.genes, by = c("rowname" = "ensembl_gene_id")) %>%
    filter(gene_biotype %in% c("protein_coding")) %>%
  mutate(is_tf = ifelse( mgi_symbol %in% all.tfs$mgi_symbol, "TF", "Not a TF")) %>%
  ggplot() +
  aes(x = is_tf, y = `.`) +
  geom_boxplot(width = 0.2) +
  ylab("Average transcript abundance") +
  scale_y_log10( expand = expansion(mult =c(0.4,0.2))) +
  theme_pubclean(base_size = 18) +
  #ggtitle("Protein coding genes") +
  xlab("") +
  stat_compare_means(method = "anova",label.y=7.5)+
  stat_compare_means(method = "t.test",label.y=6.5)

tf_prot_plot <- apply((expr.esc_prot[,all.prots$protein_id]), 2, mean, na.rm=TRUE) %>%
  as.data.frame() %>%
  rownames_to_column() %>%
  left_join( ., all.prots, by = c("rowname" = "protein_id")) %>% 
  mutate(is_tf = ifelse( mgi_symbol %in% all.tfs$mgi_symbol, "TF", "Not a TF")) %>%
  ggplot() +
  aes(x = is_tf, y = `.`) +
  geom_boxplot(width = 0.2) +
  ylab("Average protein abundance") +
  theme_pubclean(base_size = 18) +
  #ggtitle("Protein coding genes") +
  xlab("") +
  scale_y_continuous( expand =expansion(mult =c(0.3,0.2))) +
  stat_compare_means(method = "anova", label.y=17)+
  stat_compare_means(method = "t.test",label.y=15.5)



detection_plot <- ggarrange(rna_detect_plot, 
                            tf_rna_plot,tf_prot_plot, 
                            nrow = 1, 
                            widths = c(0.7,0.7,0.7), 
                            labels = c("A","B","C"),
                            font.label = list( size = 20))

detection_plot

```

<br> <br>

### Figure 1C: Principal component analysis of the pluripotent proteome

```{r Figure1C_prep}

# principal component analysis using pcaMethods::pca function.
pca.prot <- pca(object = expr.esc_prot, 
                            method = "svdImpute", 
                            scale = "uv", 
                            nPcs = 10, 
                            center = TRUE
                            )

# get values for Principal components
Figure1c_data <- scores(pca.prot) %>%
  as_tibble( rownames ="sampleid") %>% 
  left_join(covarTidy.esc_prot) %>%
  mutate(sex = ifelse(sex == "F", "Female", "Male")) %>%
  rename(
    `Sample ID` = sampleid, 
    Sex = sex
  ) 

```

```{r Figure1C_plot, fig.cap="Figure 1C: Principal component analysis reveals sex as a significant source of variation among DO mESC proteomes. PC1 and PC2 for 190 mESCs are plotted and colored by sex.", fig.width=8, fig.height=4}


Figure1c_data %>% 
  ggplot() +
  aes(x = PC1, y = PC2, col = Sex) +
  geom_point(size = 4, alpha = 0.7) +
  theme(
    axis.text = element_text(size = 18),
    axis.title = element_text(size = 20), 
    legend.text = element_text(size = 16), legend.title = element_text(size = 16)
  ) +
  xlab(paste0("PC1 (", round(pca.prot@R2[1], 3) * 100, "%)")) +
  ylab(paste0("PC2 (", round(pca.prot@R2[2], 3) * 100, "%)")) +
  theme_pubclean(base_size = 18) + 
  color_palette("npg")+
  fill_palette("npg")+
  xlim(-100,150)+
  ylim(-100,150)+
  facet_wrap(~Sex, strip.position = "right")+
  theme(
    strip.background = element_blank(),
    strip.text.x = element_blank()
  ) -> pca_plot

ggarrange(pca_plot, 
          labels = "C",
          font.label = list( size = 20))



```

<br>

The data used in plotting Figure1C can be downloaded using the link below.

```{r Figure1C_data, fig.cap="Data used in plotting Figure 1C."}

list( Figure1c_data %>% 
        select(`Sample ID`, Sex, PC1, PC2)) %>% 
  downloadthis::download_this(
    output_name = "Figure1C data",
    output_extension = ".xlsx",
    button_label = "Download Figure1C data as xlsx",
    button_type = "primary",
    has_icon = TRUE,
    icon = "fa fa-save"
  )

```

<br>

#### Drivers of PC1

```{r PC1_drivers, fig.width=10, fig.height=5, fig.cap="Chromosomal locations of the top 10% of proteins that contribute to PC1."}

# get top drivers of PC1
loadings(pca.prot) %>%
  as_tibble( rownames = "protein_id") %>% 
  left_join( all.prots) %>% 
  select(mgi_symbol, gene_chr, PC1) %>%
  filter( abs(PC1) >= quantile(abs(PC1), 0.90))-> pc1.loadings # most contributing 10%

# PC1 drivers by chromosome

pc1.loadings %>% 
  group_by(gene_chr) %>%
  # count() %>%
  # arrange(desc(n)) %>%
  ggplot()+
  aes( x = gene_chr)+
  geom_bar()+
  theme_pubclean( base_size = 18)+
  xlab("Chr")
  
```

<br>

```{r PC1_drivers_ORA, fig.cap="Over-represented biological processes and pathways in PC1 drivers.", cache = TRUE}



g.pc1 <- gost(query = pc1.loadings$mgi_symbol, 
              organism = "mmusculus", 
              domain_scope = "custom", 
              custom_bg = all.prots$mgi_symbol, 
              evcodes = TRUE,
              correction_method = "fdr")
g.pc1$result <- g.pc1$result %>% filter(term_size < 600)

g.pc1$result %>% 
   select(
    `Data source` = source,
    `Term ID` = term_id,
    `Term Name` = term_name, 
    `Term size` = term_size, 
    `# of intersecting proteins` = intersection_size,
     FDR = p_value
    ) %>%
  mutate_if( is.numeric, formatC, digits =2) %>% 
  create_dt()

```

<br>

#### Figure S1D-E: Variation in protein abundance

```{r FigureS1D_E, fig.cap="(D, E) Protein abundance is highly variable across DO mESCs. Histograms showing the mean abundance and variance per protein plotted for 7,342 proteins across 190 DO mESC lines.", fig.width=8, fig.height=4}

expr.esc_prot %>%
  as_tibble(.) %>%
  summarise_all(list(~ mean(., na.rm = T))) %>% 
  pivot_longer( all.prots$protein_id,
                names_to = "protein_id",
                values_to ="mean.prot") %>% 
  ggplot() +
  aes(x = mean.prot) +
  geom_histogram(binwidth = 0.1) +
  xlab("Mean") +
  theme_pubclean(base_size = 18) -> p.mean.hist

expr.esc_prot %>%
  as_tibble(.) %>%
  summarise_all(list(~ var(., na.rm = T))) %>% 
  pivot_longer( all.prots$protein_id,
                names_to = "protein_id",
                values_to ="var") %>% 
  ggplot() +
  aes(x = var) +
  geom_histogram(binwidth = 0.01) +
  xlab("Variance") +
  theme_pubclean(base_size = 18) +
  scale_x_log10() -> p.var.hist

ggarrange(p.mean.hist, 
          p.var.hist, 
          nrow =1, 
          labels = c("D","E"),
          font.label = list( size = 18))

```

<br>

#### Sex effects on protein abundance

```{r sex_eff, cache = TRUE, results='hide'}

# updating the code to use anova followed by tukey's hsd:
expr.esc_prot %>%
  t() %>% 
  as_tibble(rownames = "protein_id") %>%
  filter( protein_id %in% all.prots$protein_id) %>% 
  pivot_longer( cols = rownames(expr.esc_prot),
                values_to = "protein_ab",
                names_to = "sampleid") %>% 
  left_join(., select(covarTidy.esc_prot, sampleid, sex)) %>% 
  group_by(protein_id) %>% 
  rstatix::anova_test( protein_ab ~ sex) %>% 
  rstatix::adjust_pvalue( method = "BH") %>% 
  rstatix::add_significance("p.adj") %>% 
  as_tibble() -> prot_sex_aov

# passing the full data to tukey's then filtering
expr.esc_prot %>%
  t() %>% 
  as_tibble(rownames = "protein_id") %>% 
  filter( protein_id %in% all.prots$protein_id) %>% 
  pivot_longer( cols = rownames(expr.esc_prot),
                values_to = "protein_ab",
                names_to = "sampleid") %>% 
  left_join(., select(covarTidy.esc_prot, sampleid, sex)) %>% 
  group_by(protein_id) %>% 
  rstatix::tukey_hsd(protein_ab ~ sex) %>% 
  filter( protein_id %in% (filter(prot_sex_aov, p.adj.signif != "ns"))$protein_id ) -> prot_sex_tukeys


# get the medians for later
expr.esc_prot %>%
  t() %>% 
  as_tibble(rownames = "protein_id") %>%
  filter( protein_id %in% all.prots$protein_id) %>% 
  pivot_longer( cols = rownames(expr.esc_prot),
                values_to = "protein_ab",
                names_to = "sampleid") %>% 
  left_join(., select(covarTidy.esc_prot, sampleid, sex)) %>% 
  group_by(protein_id,sex) %>% 
  summarize( med = median(protein_ab, na.rm =T)) %>% 
  pivot_wider( id_cols = "protein_id",
               names_from = "sex",
               values_from = "med")-> prot_sex_med
```

```{r Sex_eff_table, fig.cap="Table of proteins showing a significant sex effect."}

prot_sex_tukeys %>%
  left_join( all.prots) %>% 
  left_join( prot_sex_med) %>% 
  arrange(p.adj) %>%
  mutate_if( is.numeric, round, 2) %>%
  select(
    `Protein ID` = protein_id,
    `MGI Symbol`= mgi_symbol, 
    `Protein location (chr)` = gene_chr,
    `Female median`=`F`,
    `Male median`= M
   ) %>%
  create_dt()

```

<br> 
<br>

### Figure 1D: Geneset variation analysis of the pluripotent proteome

```{r run_gsva_protein_and_rna, cache = TRUE, results='hide', warning=FALSE, message=FALSE}

library(GSVA)
library(GO.db)

# Preparing gene sets from GO
# reading in the GO + mgi downloaded from: http://www.informatics.jax.org/gotools/data/input/MGIgenes_by_GOid.txt
go_terms <- read_tsv( "http://www.informatics.jax.org/gotools/data/input/MGIgenes_by_GOid.txt") %>% 
  mutate( genes = str_split(genes, ",")) %>% 
  unnest() # separete the symbols, note the overlap: length(intersect(unique(go_terms$genes), all.prots$mgi_symbol) ) = 6757

slim_go_terms <- read_tsv( "http://www.informatics.jax.org/gotools/data/input/map2MGIslim.txt") %>% 
  select(-term) %>% 
  mutate( ONT = case_when( aspect == "P" ~  "BP",
                     aspect == "F" ~ "MF",
                     aspect == "C" ~ "CC"
                     )
          ) %>% 
  select(-aspect)

genesbygo <- split(go_terms$genes, go_terms$GO_id)

go_terms_annot <- go_terms %>%  
  select(GO_id) %>% 
  distinct() %>% 
  left_join( slim_go_terms %>%  select( GO_id, ONT) %>% distinct())

goannot_wdef <- AnnotationDbi::select(GO.db, keys= unique(go_terms$GO_id), columns=c("GOID","DEFINITION","ONTOLOGY","TERM")) %>%
  left_join( slim_go_terms, by=c("GOID"="GO_id")) %>% 
  mutate( ONTOLOGY = ONT) %>% 
  select(-ONT)

go_bp <- goannot_wdef %>% filter( ONTOLOGY == "BP") %>% 
  select(GOID) %>%  distinct()

# Run GSVA using protein abundance
expr.esc_prot_upd <- expr.esc_prot[, all.prots$protein_id]
colnames(expr.esc_prot_upd) <- all.prots$mgi_symbol
gsva_prot <- gsva(  expr = t(expr.esc_prot_upd),
                    genesbygo,
                    method ="gsva",
                    kcdf = "none",
                    min.sz = 5, 
                    max.sz = 1000,
                    mx.diff = TRUE)

# Run GSVA with complexes to complement the co-abundance analysis
genes_for_complex_gsva <- complex.genes %>%  
  enframe( "Complex Name","human_ids") %>%
  unnest(human_ids) %>% 
  left_join( complex.gene.list) %>% 
  filter( !is.na(protein_id)) %>% 
  select( `Complex Name` , protein_id)
gsva_complexes <- unique(genes_for_complex_gsva$`Complex Name`)
genes_by_complex <- split(genes_for_complex_gsva$protein_id, genes_for_complex_gsva$`Complex Name`)
gsva_prot_comp <-  gsva(  expr = t(expr.esc_prot[,all.prots$protein_id]),
                    genes_by_complex,
                    method ="gsva",
                    kcdf = "none",
                    min.sz = 5, 
                    max.sz = 1000,
                    mx.diff = TRUE)

# Run GSVA using transcript abundance
expr.esc_rna_upd <- expr.esc_rna[, all.genes$ensembl_gene_id]
colnames(expr.esc_rna_upd) <- all.genes$mgi_symbol
gsva_rna <- gsva(  expr = t(expr.esc_rna_upd),
                    genesbygo,
                    method ="gsva",
                    kcdf = "none",
                    min.sz = 5, 
                    max.sz = 1000,
                    mx.diff = TRUE)

# Annotation and statistical follow up using ANOVA + Tukey's on Protein results
covar.lifr  %>% 
  rename( top_muga = rowname ) %>% 
  left_join(sample.matches.all) %>% 
  select(sampleid, lifr_geno) %>% 
  inner_join(covarTidy.esc_prot) -> covar_lifr_upd

gsva_prot %>% 
  as_tibble(rownames = "Category") %>% 
  filter( Category %in% go_bp$GOID) %>% #filtering for BP
  rbind( as_tibble(gsva_prot_comp, rownames = "Category" )) %>% 
  # rbind( as_tibble(gsva_prot3, rownames = "Category" )) %>% 
  # rbind( as_tibble(gsva_prot4, rownames = "Category")) %>% 
  pivot_longer( cols = rownames(expr.esc_prot_upd),
                values_to = "Enrichment_Score",
                names_to = "sampleid") %>% 
  # add sexes + lifr genotypes
  left_join( covar_lifr_upd) -> gsva_results


gsva_results %>% 
  group_by( Category) %>% 
  rstatix::anova_test( Enrichment_Score ~ sex+lifr_geno+sex*lifr_geno) %>% 
  rstatix::adjust_pvalue(method = "BH") %>%
  rstatix::add_significance("p.adj") %>% 
  ungroup() -> aov_results 

aov_results %>% 
  as_tibble() %>% 
  filter( p.adj.signif != "ns" ) -> signif_eff_terms

# passing all to Tukey's with the full model
gsva_results %>% 
  group_by(Category) %>% 
  rstatix::tukey_hsd( Enrichment_Score ~ sex+lifr_geno+sex:lifr_geno) %>% 
  ungroup() %>% 
  as_tibble() %>% 
  left_join( goannot_wdef, by = c("Category" = "GOID")) -> results_tukey

# filtering for the ones that were significant in ANOVA + Tukey's
results_tukey %>% 
  filter( p.adj < 0.05) %>% 
  inner_join( ., select( signif_eff_terms, Category, term = Effect)) -> signif_results_tukey

# Annotation and statistical follow up using ANOVA + Tukey's on RNA results
gsva_rna %>% 
  as_tibble(rownames = "Category") %>% 
  filter( Category %in% go_bp$GOID) %>% #filtering for BP
  pivot_longer( cols = rownames(expr.esc_rna_upd),
                values_to = "Enrichment_Score",
                names_to = "sampleid") %>% 
  # add sexes + lifr genotypes
  left_join( covar_lifr_upd) -> gsva_rna_results

gsva_rna_results %>% 
  group_by( Category) %>% 
  rstatix::anova_test( Enrichment_Score ~ sex+lifr_geno+sex*lifr_geno) %>% 
  rstatix::adjust_pvalue( method = "BH") %>%
  rstatix::add_significance("p.adj") %>% 
  ungroup() -> gsva_rna_aov

gsva_rna_results %>% 
  group_by(Category) %>% 
  rstatix::tukey_hsd( Enrichment_Score ~ sex+lifr_geno+sex:lifr_geno) %>% 
  ungroup() %>% 
  as_tibble() %>% 
  left_join( goannot_wdef, by = c("Category" = "GOID")) -> gsva_rna_tukey

gsva_rna_aov %>% 
  as_tibble() %>% 
  filter( p.adj.signif != "ns" ) -> signif_eff_terms_rna

gsva_rna_tukey %>% 
  filter( p.adj < 0.05) %>% 
  inner_join( ., select( signif_eff_terms_rna, Category, term = Effect)) -> signif_results_tukey_rna

```


```{r Figure1D_prep}

Figure1d_data <- gsva_results %>%
  filter( Category %in% c("GO:0006306", # DNA methylation
                          "GO:0006338", # Chromatin remodeling
                          "GO:0042254" # Ribosome biogenesis
                          )) %>%
  left_join( select(goannot_wdef, Category = GOID, TERM)) %>% 
  select( Category, TERM, sampleid, Enrichment_Score, sex, lifr_geno) %>% 
  left_join(
    signif_results_tukey %>% 
      filter( term =="sex") %>% 
      select(Category, p.adj, p.adj.signif )
  ) %>% 
  rbind(
       gsva_results %>%  
         filter( Category == "GO:0006471") %>%  # protein ADP-ribosylation
         left_join( select(goannot_wdef, Category = GOID, TERM)) %>% 
         select( Category, TERM, sampleid, Enrichment_Score, sex, lifr_geno) %>% 
         left_join(
           signif_results_tukey %>% 
             filter( term =="lifr_geno") %>% 
             select(Category, p.adj, p.adj.signif )
           )
       ) %>% 
  mutate( Enrichment_Score = round(Enrichment_Score, 2)) %>% 
  select( `GO ID` = Category, 
          `GO Term`=TERM, 
          Sample = sampleid, 
          `Enrichment Score`=Enrichment_Score, 
          Sex = sex, 
          `Lifr genotype`=lifr_geno, 
          `Significance`=p.adj.signif)

```


```{r Figure1D_plot, fig.width=16, fig.height=4, fig.cap="Figure 1D: Enrichment scores obtained from GSVA for Gene Ontology Biological Processes (GO:BP) showing significant differences between mESCs with different sexes and genotypes at the Lifr locus are plotted. GO:BP DNA methylation, chromatin remodeling and ribosome biogenesis show significantly higher enrichment in males in comparison to females and, protein ADP-ribosylation shows significantly higher enrichment in mESCs with at least one copy of the reference allele in comparison to ones carrying two copies of the alternative allele at the *Lifr* locus (two-way anova followed by Tukey’s HSD, `*: p value < 0.05, ****: p value < 0.00005`)."}


Figure1d_data %>% 
  ggplot()+
  aes( x = Sex,
       y = `Enrichment Score`,
       col = Sex)+
  geom_boxplot(width =0.2, size = 1.1)+
  #geom_jitter()+
  #geom_beeswarm(aes(col = sex))+
  theme_pubclean(base_size = 16)+
  stat_pvalue_manual( filter(signif_results_tukey,Category  == "GO:0006306", term == "sex"),
                      label = "{p.adj.signif}",
                      y.position = 0.85)+
  color_palette("npg")+
  ylab("Enrichment Score")+
  ggtitle("DNA Methylation")+
  xlab("")+
  ylim(-1,1)+
  theme(axis.text.x = element_text(size = 18),
        axis.text.y = element_text(size = 18),
        axis.title = element_text(size =18),
        plot.title = element_text(size =18, hjust = 0.5)) -> p_met

Figure1d_data %>% 
  ggplot()+
  aes( x = Sex,
       y = `Enrichment Score`,
       col = Sex)+
  geom_boxplot(width =0.2, size = 1.1)+
  #geom_jitter()+
  #geom_beeswarm(aes(col = sex))+
  theme_pubclean(base_size = 16)+
  stat_pvalue_manual( filter(signif_results_tukey,Category  == "GO:0006338", term == "sex"),
                      label = "{p.adj.signif}",
                      y.position = 0.85)+
  color_palette("npg")+
  ylab("Enrichment Score")+
  ggtitle("Chromatin remodeling")+
  xlab("")+
  ylim(-1,1)+
  theme(axis.text.x = element_text(size = 18),
        axis.text.y = element_text(size = 18),
        axis.title = element_text(size =18),
        plot.title = element_text(size =18, hjust = .5)) -> p_chrom

Figure1d_data %>% 
  ggplot()+
  aes( x = Sex,
       y = `Enrichment Score`,
       col = Sex)+
  geom_boxplot(width =0.2, size = 1.1)+
  #geom_jitter()+
  #geom_beeswarm(aes(col = sex))+
  theme_pubclean(base_size = 16)+
  stat_pvalue_manual( filter(signif_results_tukey,Category  == "GO:0042254", term == "sex"),
                      label = "{p.adj.signif}",
                      y.position = 0.85)+
  color_palette("npg")+
  ylab("Enrichment Score")+
  ggtitle("Ribosome biogenesis")+
  xlab("")+
  ylim(-1,1)+
  theme(axis.text.x = element_text(size = 18),
        axis.text.y = element_text(size = 18),
        axis.title = element_text(size =18),
        plot.title = element_text(size =18, hjust = .5)) -> p_ribo

gsva_results %>%
  filter( Category ==  "GO:0006471") %>%
  left_join( select(goannot_wdef, Category = GOID, TERM)) %>% 
  rename( `GO ID` = Category, 
          `GO Term`=TERM, 
          Sample = sampleid, 
          `Enrichment Score`=Enrichment_Score, 
          Sex = sex, 
          `Lifr genotype`=lifr_geno) %>% 
  ggplot()+
  aes( x = `Lifr genotype`,
       y =  `Enrichment Score`,
       col = `Lifr genotype`)+
  geom_boxplot(width =0.2, size = 1.1)+
  #geom_jitter()+
  #geom_beeswarm(aes(col = sex))+
  theme_pubclean(base_size = 16)+
  stat_pvalue_manual( filter(signif_results_tukey,Category  == "GO:0006471", term == "lifr_geno"),
                      label = "{p.adj.signif}",
                      y.position = c(0.85, 0.95))+
  color_palette("Dark2")+
  ylab("Enrichment Score")+
  ggtitle("Protein ADP-ribosylation")+
  xlab("")+
  labs(col ="LIFR")+
  ylim(-1,1)+
  theme(legend.position = "none",
        axis.text.x = element_text(size = 18),
        axis.text.y = element_text(size = 18),
        axis.title = element_text(size =18),
        plot.title = element_text(size =18, hjust = .5)) -> p_adp

left <- ggarrange(p_met,p_chrom, p_ribo, 
                  common.legend = TRUE, 
                  nrow = 1,
                  ncol = 3, 
                  legend = "none")

ggarrange(left, p_adp, 
          nrow=1, 
          widths = c(.8,0.4) , 
          labels = c("D"), 
          font.label = list( size = 20))


```

<br>

The data used in plotting Figure1D can be downloaded using the link below.

```{r Figure1D_data, fig.cap="Data used to generate Figure 1D."}

list(Figure1d_data) %>% 
  downloadthis::download_this(
    output_name = "Figure1D data",
    output_extension = ".xlsx",
    button_label = "Download Figure1D data as xlsx",
    button_type = "primary",
    has_icon = TRUE,
    icon = "fa fa-save"
  )

```

<br>

#### Table S3: Gene Set Variation Analysis results. 

Biological processes and protein complexes that show significant differences between experimental groups (sex, *Lifr* genotype, or their interaction) in GSVA enrichment scores obtained from protein or transcript abundance are listed. The source of the significant effect (sex, *Lifr* genotype or their interaction) as well as the two groups being compared is included with the Tukey's HSD estimate and the adjusted p-value for each term. 


```{r TableS3_generate, warning=FALSE, message=FALSE}

signif_results_tukey %>% 
  #left_join( ., goannot_wdef %>%  select(TERM, GOID) %>% distinct()) %>% 
  select(Effect= term, Category, TERM,group1, group2, estimate,p.adj) %>% 
  distinct() %>% 
  mutate( estimate= round(estimate,2),
          TERM = ifelse( Category %in% genes_for_complex_gsva$`Complex Name`, "Protein Complex", TERM)) %>% 
  mutate( 
    temp = TERM,
    TERM = ifelse( TERM =="Protein Complex" & !is.na(TERM), Category, TERM ),
    Category = ifelse( temp == "Protein Complex" & !is.na(TERM), temp, Category)
          ) %>%  
  arrange(estimate) %>% 
  select( 
    Effect, 
    `Term ID` = Category,
    `Term Name` = TERM,
    `Group 1`= group1,
    `Group 2`= group2,
    `Tukey's HSD estimate` = estimate, 
    `Adjusted p-value` = p.adj
    ) -> tables3_sheet1


signif_results_tukey_rna %>%
  #left_join( ., goannot_wdef %>%  select(TERM, GOID) %>% distinct()) %>% 
  select(Effect= term, Category, TERM,group1, group2, estimate,p.adj) %>% 
  distinct() %>% 
  mutate( estimate= round(estimate,2),
          TERM = ifelse( Category %in% genes_for_complex_gsva$`Complex Name`, "Protein Complex", TERM)) %>% 
  mutate( 
    temp = TERM,
    TERM = ifelse( TERM =="Protein Complex" & !is.na(TERM), Category, TERM ),
    Category = ifelse( temp == "Protein Complex" & !is.na(TERM), temp, Category)
          ) %>% 
  arrange(estimate) %>% 
  select( 
    Effect, 
    `Term ID` = Category,
    `Term Name` = TERM,
    `Group 1`= group1,
    `Group 2`= group2,
    `Tukey's HSD estimate` = estimate, 
    `Adjusted p-value` = p.adj
    ) -> tables3_sheet2

  
```


```{r TableS3_display, echo=FALSE}

# 
# writexl::write_xlsx( list( GSVA_protein = tables3_sheet1,
#                              GSVA_transcript = tables3_sheet2),
#                     path = here("TableS3_GSVA_results.xlsx"),
#                     col_names = TRUE,
#                     format_headers = TRUE
#                     )


# xfun::embed_file(here("Table_S3.xlsx"))

download_file(
  path = here("Table_S3.xlsx"),
  output_name = "Table_S3",
  button_label = "Download Table_S3.xlsx",
  button_type = "primary",
  has_icon = TRUE,
  icon = "fa fa-save",
  self_contained = FALSE
)

```

<br>

#### Over-representation analysis in proteins with high and low variation in abundance

```{r ORA_var_prots, fig.cap="Over-represented biological processes and pathways in proteins with high (top 5th percentile) and low variation (bottom 5th percentile).", cache = TRUE}

var_prot <- expr.esc_prot %>%
  as_tibble(.) %>%
  summarise_all(list(~ var(., na.rm = T))) %>% 
  pivot_longer( all.prots$protein_id,
                names_to = "protein_id",
                values_to ="var") 

high.var.prots <- var_prot %>%  
  filter( var >= quantile(var, 0.95)) %>% 
  left_join( all.prots %>%  
               select( protein_id, mgi_symbol))

low.var.prots <- var_prot %>% 
  filter( var <= quantile(var, 0.05))%>% 
  left_join( all.prots %>%  
               select( protein_id, mgi_symbol))

g.high.var <- gost(
  query = high.var.prots$mgi_symbol,
  organism = "mmusculus",
  domain_scope = "custom",
  custom_bg = all.prots$mgi_symbol,
  evcodes = TRUE,
  correction_method = "fdr"
)
g.high.var$result <- g.high.var$result %>% filter(term_size < 600)

g.low.var <- gost(
  query = low.var.prots$mgi_symbol,
  organism = "mmusculus",
  domain_scope = "custom",
  custom_bg = all.prots$mgi_symbol,
  evcodes = TRUE,
  correction_method = "fdr"
)
g.low.var$result <- g.low.var$result %>% filter(term_size < 600)

g.high.var$result %>% 
  mutate( `Gene Set` = "High variation") %>% 
  select(`Gene Set`, `Term Name` = term_name, source, FDR = p_value, `Term size` = term_size, Intersection = intersection_size,term_id)  %>% 
  rbind(
    g.low.var$result %>%
      mutate( `Gene Set` = "Low variation") %>% 
       select(`Gene Set`, `Term Name` = term_name, source, FDR = p_value, `Term size` = term_size, Intersection = intersection_size, term_id) 
    ) %>% 
  select(
         `Gene Set`, 
    `Data source` = source,
    `Term ID` = term_id,
    `Term Name` , 
    `Term size` , 
    `# of intersecting proteins` = Intersection,
     FDR 
    ) %>% 
  mutate_if( is.numeric, formatC, digits =2) %>% 
  create_dt()

```

<br>

#### Figure S1F-G

```{r FigureS1F_G, fig.cap ="FigureS1: (F) Mean abundance and variance plotted for all proteins (gray) with proteins identified as part of `Extracellular region` and `ECM protein` GO Terms in most variable proteins (top 5th percentile %CV), in overrepresentation analysis, highlighted in blue. (G) Mean abundance and variance plotted for all proteins with proteins identified as `REX1 Target` in TRANSFAC database in least variable proteins (bottom 5th percentile %CV), in overrepresentation analysis, highlighted in blue and REX1 (Zfp42) highlighted in purple.", fig.width=10, fig.height=5}

# get proteins identified as part of "extracellular region"
ecm_genes <- tibble(mgi_symbol = unlist(str_split((g.high.var$result %>% filter(term_name %in% c("extracellular region", "extracellular matrix")))$intersection, ","))) %>%
  left_join(., all.prots)

# get proteins identified as REX1 targets 
rex1.genes <- tibble(mgi_symbol = unlist(str_split((g.low.var$result %>% filter(source == "TF"))$intersection[1], ","))) %>%
  left_join(., all.prots)

# get variance + mean by protein into a single data frame
mean_prot <- expr.esc_prot %>%
  as_tibble(.) %>%
  summarise_all(list(~ mean(., na.rm = T))) %>% 
  pivot_longer( all.prots$protein_id,
                names_to = "protein_id",
                values_to ="mean") 

var_mean_prot <- var_prot %>% 
  full_join( mean_prot) %>% 
  left_join( all.prots)
  
var_mean_prot %>% 
  ggscatter(., 
            x = "mean", 
            y = "var", 
            size = 3, 
            alpha = 0.5,
            col="gray",
            yscale = "log10",
            #xscale = "log10",
            show.legend.text = FALSE
            ) +
  xlab("Mean protein abundance") +
  ylab("Variance in protein abundance") +
  ggtitle("REX1 Target Proteins")+
  theme_pubclean(base_size = 18) + 
  rremove("legend") +
  geom_point(
    data =   filter( var_mean_prot, ensembl_gene_id %in% rex1.genes$ensembl_gene_id) ,
    col = "blue", alpha = 0.6, size = 3)+
  geom_point( data = filter(var_mean_prot, mgi_symbol == "Zfp42"), col = "purple", size = 4, alpha = 1)+
  geom_label( data = filter(var_mean_prot, mgi_symbol == "Zfp42") , label = "Rex1", nudge_x = .2, nudge_y = .2)  -> plot_rex1


var_mean_prot %>% 
  ggscatter(., 
            y = "var", 
            x = "mean", 
            size = 3, 
            alpha = 0.5,
            col="gray",
            yscale = "log10",
            #xscale = "log10",
            show.legend.text = FALSE
            ) +
  ylab("Variance in protein abundance") +
  xlab("Mean protein abundance") +
  theme_pubclean(base_size = 18) + 
  rremove("legend") +
  geom_point(
    data =   filter( var_mean_prot, ensembl_gene_id %in% c(ecm_genes$ensembl_gene_id)) ,
    col = "blue", alpha = 0.6, size = 3) +
  ggtitle("Extracellular Matrix Proteins")-> plot_ecm

ggarrange( plot_ecm, plot_rex1, nrow =1 ,
           labels = c("F","G"),
          font.label = list( size = 20)
        )


```


A work by Selcan Aydin

selcan.aydin@jax.org